duckrun-0.1.6.1-py3-none-any.whl → duckrun-0.1.6.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -309,9 +309,6 @@ class Duckrun:
         print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
         print(f"{'='*60}\n")
 
-        print("Available views in DuckDB:")
-        self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
-
         if self.scan_all_schemas:
             print(f"\n💡 Note: Tables are prefixed with schema (e.g., dbo_tablename)")
             print(f" Default schema for operations: {self.schema}\n")
@@ -509,6 +506,246 @@ class Duckrun:
         print('='*60)
         return True
 
+    def copy(self, local_folder: str, remote_folder: str,
+             file_extensions: Optional[List[str]] = None,
+             overwrite: bool = False) -> bool:
+        """
+        Copy files from a local folder to OneLake Files section.
+
+        Args:
+            local_folder: Path to local folder containing files to upload
+            remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
+            file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+            overwrite: Whether to overwrite existing files (default: False)
+
+        Returns:
+            True if all files uploaded successfully, False otherwise
+
+        Examples:
+            # Upload all files from local folder to a target folder
+            dr.copy("./local_data", "uploaded_data")
+
+            # Upload only CSV files to a specific subfolder
+            dr.copy("./reports", "daily_reports", ['.csv'])
+
+            # Upload with overwrite enabled
+            dr.copy("./backup", "backups", overwrite=True)
+        """
+        if not os.path.exists(local_folder):
+            print(f"❌ Local folder not found: {local_folder}")
+            return False
+
+        if not os.path.isdir(local_folder):
+            print(f"❌ Path is not a directory: {local_folder}")
+            return False
+
+        # Get Azure token
+        token = self._get_storage_token()
+        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+            print("Getting Azure token for file upload...")
+            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+            token_obj = credential.get_token("https://storage.azure.com/.default")
+            token = token_obj.token
+            os.environ["AZURE_STORAGE_TOKEN"] = token
+
+        # Setup OneLake Files URL (not Tables)
+        files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
+        store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+        # Collect files to upload
+        files_to_upload = []
+        for root, dirs, files in os.walk(local_folder):
+            for file in files:
+                local_file_path = os.path.join(root, file)
+
+                # Filter by extensions if specified
+                if file_extensions:
+                    _, ext = os.path.splitext(file)
+                    if ext.lower() not in [e.lower() for e in file_extensions]:
+                        continue
+
+                # Calculate relative path from local_folder
+                rel_path = os.path.relpath(local_file_path, local_folder)
+
+                # Build remote path in OneLake Files (remote_folder is now mandatory)
+                remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
+
+                files_to_upload.append((local_file_path, remote_path))
+
+        if not files_to_upload:
+            print(f"No files found to upload in {local_folder}")
+            if file_extensions:
+                print(f" (filtered by extensions: {file_extensions})")
+            return True
+
+        print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
+        print(f" Target folder: {remote_folder}")
+
+        uploaded_count = 0
+        failed_count = 0
+
+        for local_path, remote_path in files_to_upload:
+            try:
+                # Check if file exists (if not overwriting)
+                if not overwrite:
+                    try:
+                        obs.head(store, remote_path)
+                        print(f" ⏭ Skipped (exists): {remote_path}")
+                        continue
+                    except Exception:
+                        # File doesn't exist, proceed with upload
+                        pass
+
+                # Read local file
+                with open(local_path, 'rb') as f:
+                    file_data = f.read()
+
+                # Upload to OneLake Files
+                obs.put(store, remote_path, file_data)
+
+                file_size = len(file_data)
+                size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
+                uploaded_count += 1
+
+            except Exception as e:
+                print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
+                failed_count += 1
+
+        print(f"\n{'='*60}")
+        if failed_count == 0:
+            print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
+        else:
+            print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
+        print(f"{'='*60}")
+
+        return failed_count == 0
+
+    def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
+                 file_extensions: Optional[List[str]] = None,
+                 overwrite: bool = False) -> bool:
+        """
+        Download files from OneLake Files section to a local folder.
+
+        Args:
+            remote_folder: Optional subfolder path in OneLake Files to download from
+            local_folder: Local folder path to download files to (default: "./downloaded_files")
+            file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+            overwrite: Whether to overwrite existing local files (default: False)
+
+        Returns:
+            True if all files downloaded successfully, False otherwise
+
+        Examples:
+            # Download all files from OneLake Files root
+            dr.download()
+
+            # Download only CSV files from a specific subfolder
+            dr.download("daily_reports", "./reports", ['.csv'])
+        """
+        # Get Azure token
+        token = self._get_storage_token()
+        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+            print("Getting Azure token for file download...")
+            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+            token_obj = credential.get_token("https://storage.azure.com/.default")
+            token = token_obj.token
+            os.environ["AZURE_STORAGE_TOKEN"] = token
+
+        # Setup OneLake Files URL (not Tables)
+        files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
+        store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+        # Create local directory
+        os.makedirs(local_folder, exist_ok=True)
+
+        # List files in OneLake Files
+        print(f"📁 Discovering files in OneLake Files...")
+        if remote_folder:
+            print(f" Source folder: {remote_folder}")
+            prefix = f"{remote_folder.strip('/')}/"
+        else:
+            prefix = ""
+
+        try:
+            list_stream = obs.list(store, prefix=prefix)
+            files_to_download = []
+
+            for batch in list_stream:
+                for obj in batch:
+                    remote_path = obj["path"]
+
+                    # Filter by extensions if specified
+                    if file_extensions:
+                        _, ext = os.path.splitext(remote_path)
+                        if ext.lower() not in [e.lower() for e in file_extensions]:
+                            continue
+
+                    # Calculate local path
+                    if remote_folder:
+                        rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
+                    else:
+                        rel_path = remote_path
+
+                    local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
+                    files_to_download.append((remote_path, local_path))
+
+            if not files_to_download:
+                print(f"No files found to download")
+                if file_extensions:
+                    print(f" (filtered by extensions: {file_extensions})")
+                return True
+
+            print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
+
+            downloaded_count = 0
+            failed_count = 0
+
+            for remote_path, local_path in files_to_download:
+                try:
+                    # Check if local file exists (if not overwriting)
+                    if not overwrite and os.path.exists(local_path):
+                        print(f" ⏭ Skipped (exists): {local_path}")
+                        continue
+
+                    # Ensure local directory exists
+                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+                    # Download file
+                    data = obs.get(store, remote_path).bytes()
+
+                    # Write to local file
+                    with open(local_path, 'wb') as f:
+                        f.write(data)
+
+                    file_size = len(data)
+                    size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                    size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                    print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
+                    downloaded_count += 1
+
+                except Exception as e:
+                    print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
+                    failed_count += 1
+
+            print(f"\n{'='*60}")
+            if failed_count == 0:
+                print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
+            else:
+                print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
+            print(f"{'='*60}")
+
+            return failed_count == 0
+
+        except Exception as e:
+            print(f"❌ Error listing files from OneLake: {e}")
+            return False
+
     def sql(self, query: str):
         """
         Execute raw SQL query with Spark-style write API.
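Both new methods fall back to the same interactive token flow when no cached storage token is available. A minimal standalone sketch of that pattern, using only the `azure-identity` classes imported above (running it requires an Azure CLI login or a browser sign-in):

```python
from azure.identity import (
    AzureCliCredential,
    ChainedTokenCredential,
    InteractiveBrowserCredential,
)

# Try an existing `az login` session first, then fall back to an
# interactive browser sign-in, as copy()/download() do above.
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())

# OneLake is accessed through the Azure Storage resource scope.
token = credential.get_token("https://storage.azure.com/.default").token
print(f"Got bearer token ({len(token)} chars)")  # used for AzureStore.from_url(...)
```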
duckrun-0.1.6.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.6.1
+Version: 0.1.6.3
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -35,6 +35,11 @@ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and
 ```bash
 pip install duckrun
 ```
+For local usage, install the `local` extra. Note: when running locally, your internet speed will be the main bottleneck.
+
+```bash
+pip install duckrun[local]
+```
 
 ## Quick Start
 
@@ -53,6 +58,10 @@ con.sql("SELECT * FROM my_table LIMIT 10").show()
 
 # Write to Delta tables (Spark-style API)
 con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
+
+# Upload/download files to/from OneLake Files
+con.copy("./local_folder", "target_folder")  # Upload files
+con.download("target_folder", "./downloaded")  # Download files
 ```
 
 That's it! No `sql_folder` needed for data exploration.
@@ -122,7 +131,38 @@ con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
 
 **Note:** `.format("delta")` is optional - Delta is the default format!
 
-### 2. Pipeline Orchestration
+### 2. File Management (OneLake Files)
+
+Upload and download files to/from the OneLake Files section (not Delta tables):
+
+```python
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Upload files to OneLake Files (remote_folder is required)
+con.copy("./local_data", "uploaded_data")
+
+# Upload only specific file types
+con.copy("./reports", "daily_reports", ['.csv', '.parquet'])
+
+# Upload with overwrite enabled (default is False for safety)
+con.copy("./backup", "backups", overwrite=True)
+
+# Download files from OneLake Files
+con.download("uploaded_data", "./downloaded")
+
+# Download only CSV files from a specific folder
+con.download("daily_reports", "./reports", ['.csv'])
+```
+
+**Key Features:**
+- ✅ **Files go to the OneLake Files section** (not Delta Tables)
+- ✅ **`remote_folder` parameter is required** for uploads (prevents accidental uploads)
+- ✅ **`overwrite=False` by default** (safer - prevents accidental overwrites)
+- ✅ **File extension filtering** (e.g., only `.csv` or `.parquet` files)
+- ✅ **Preserves folder structure** during upload/download
+- ✅ **Progress reporting** with file sizes and upload status
+
+### 3. Pipeline Orchestration
 
 For production workflows with reusable SQL and Python tasks:
 
@@ -281,6 +321,63 @@ con = duckrun.connect(
 )
 ```
 
+## File Management API Reference
+
+### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
+
+Upload files from a local folder to the OneLake Files section.
+
+**Parameters:**
+- `local_folder` (str): Path to local folder containing files to upload
+- `remote_folder` (str): **Required** target folder path in OneLake Files
+- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
+- `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
+
+**Returns:** `True` if all files uploaded successfully, `False` otherwise
+
+**Examples:**
+```python
+# Upload all files to a target folder
+con.copy("./data", "processed_data")
+
+# Upload only CSV and Parquet files
+con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
+
+# Upload with overwrite enabled
+con.copy("./backup", "daily_backup", overwrite=True)
+```
+
+### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
+
+Download files from the OneLake Files section to a local folder.
+
+**Parameters:**
+- `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
+- `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
+- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
+- `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
+
+**Returns:** `True` if all files downloaded successfully, `False` otherwise
+
+**Examples:**
+```python
+# Download all files from OneLake Files root
+con.download()
+
+# Download from specific folder
+con.download("processed_data", "./local_data")
+
+# Download only JSON files
+con.download("config", "./configs", ['.json'])
+```
+
+**Important Notes:**
+- Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
+- The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
+- Both methods default to `overwrite=False` for safety
+- Folder structure is preserved during upload/download operations
+- Progress is reported with file names, sizes, and upload/download status
+
 ## Complete Example
 
 ```python
@@ -289,7 +386,10 @@ import duckrun
 # Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
-# Pipeline with mixed tasks
+# 1. Upload raw data files to OneLake Files
+con.copy("./raw_data", "raw_uploads", ['.csv', '.json'])
+
+# 2. Pipeline with mixed tasks
 pipeline = [
     # Download raw data (Python)
     ('fetch_api_data', ('https://api.example.com/sales', 'raw')),
@@ -304,20 +404,30 @@ pipeline = [
     ('sales_history', 'append')
 ]
 
-# Run
+# Run pipeline
 success = con.run(pipeline)
 
-# Explore results
+# 3. Explore results using DuckDB
 con.sql("SELECT * FROM regional_summary").show()
 
-# Export to new table
+# 4. Export to new Delta table
 con.sql("""
     SELECT region, SUM(total) as grand_total
     FROM regional_summary
     GROUP BY region
 """).write.mode("overwrite").saveAsTable("region_totals")
+
+# 5. Download processed files for external systems
+con.download("processed_reports", "./exports", ['.csv'])
 ```
 
+**This example demonstrates:**
+- 📁 **File uploads** to the OneLake Files section
+- 🔄 **Pipeline orchestration** with SQL and Python tasks
+- ⚡ **Fast data exploration** with DuckDB
+- 💾 **Delta table creation** with Spark-style API
+- 📤 **File downloads** from OneLake Files
+
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
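Both `copy()` and `download()` apply the same case-insensitive extension filter shown in the core.py diff above. A minimal sketch of that filter in isolation (the file names are invented for illustration):

```python
import os

# Same case-insensitive extension match that copy()/download() perform.
file_extensions = ['.csv', '.parquet']
names = ["sales.CSV", "events.Parquet", "notes.txt"]

kept = [
    name for name in names
    if os.path.splitext(name)[1].lower() in [e.lower() for e in file_extensions]
]
print(kept)  # ['sales.CSV', 'events.Parquet']
```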
duckrun-0.1.6.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=CT2NH5hCLsv4uB5zH3VxTuCVQy0nWkPBG-cICLPhG_8,34245
+duckrun-0.1.6.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.6.3.dist-info/METADATA,sha256=ny5DcRSU1B4SdHdJqHCYk0-hNo9-zqFABqMY9ulAVNk,13595
+duckrun-0.1.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.6.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.6.3.dist-info/RECORD,,
duckrun-0.1.6.1.dist-info/RECORD REMOVED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=A5UdhpdEE9Wzje5d16c0ejTWn24zy5LCaoX6OghO8Us,23352
-duckrun-0.1.6.1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.6.1.dist-info/METADATA,sha256=oHc38InTVr48Hp2mER4tbFL0RkWMEFXqg48OPYTk9qk,9358
-duckrun-0.1.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.6.1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.6.1.dist-info/RECORD,,