duckrun-0.1.6.2-py3-none-any.whl → duckrun-0.1.6.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -506,6 +506,246 @@ class Duckrun:
          print('='*60)
          return True

+     def copy(self, local_folder: str, remote_folder: str,
+              file_extensions: Optional[List[str]] = None,
+              overwrite: bool = False) -> bool:
+         """
+         Copy files from a local folder to OneLake Files section.
+
+         Args:
+             local_folder: Path to local folder containing files to upload
+             remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
+             file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+             overwrite: Whether to overwrite existing files (default: False)
+
+         Returns:
+             True if all files uploaded successfully, False otherwise
+
+         Examples:
+             # Upload all files from local folder to a target folder
+             dr.copy("./local_data", "uploaded_data")
+
+             # Upload only CSV files to a specific subfolder
+             dr.copy("./reports", "daily_reports", ['.csv'])
+
+             # Upload with overwrite enabled
+             dr.copy("./backup", "backups", overwrite=True)
+         """
+         if not os.path.exists(local_folder):
+             print(f"❌ Local folder not found: {local_folder}")
+             return False
+
+         if not os.path.isdir(local_folder):
+             print(f"❌ Path is not a directory: {local_folder}")
+             return False
+
+         # Get Azure token
+         token = self._get_storage_token()
+         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             print("Getting Azure token for file upload...")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token_obj = credential.get_token("https://storage.azure.com/.default")
+             token = token_obj.token
+             os.environ["AZURE_STORAGE_TOKEN"] = token
+
+         # Setup OneLake Files URL (not Tables)
+         files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
+         store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+         # Collect files to upload
+         files_to_upload = []
+         for root, dirs, files in os.walk(local_folder):
+             for file in files:
+                 local_file_path = os.path.join(root, file)
+
+                 # Filter by extensions if specified
+                 if file_extensions:
+                     _, ext = os.path.splitext(file)
+                     if ext.lower() not in [e.lower() for e in file_extensions]:
+                         continue
+
+                 # Calculate relative path from local_folder
+                 rel_path = os.path.relpath(local_file_path, local_folder)
+
+                 # Build remote path in OneLake Files (remote_folder is now mandatory)
+                 remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
+
+                 files_to_upload.append((local_file_path, remote_path))
+
+         if not files_to_upload:
+             print(f"No files found to upload in {local_folder}")
+             if file_extensions:
+                 print(f" (filtered by extensions: {file_extensions})")
+             return True
+
+         print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
+         print(f" Target folder: {remote_folder}")
+
+         uploaded_count = 0
+         failed_count = 0
+
+         for local_path, remote_path in files_to_upload:
+             try:
+                 # Check if file exists (if not overwriting)
+                 if not overwrite:
+                     try:
+                         obs.head(store, remote_path)
+                         print(f" ⏭ Skipped (exists): {remote_path}")
+                         continue
+                     except Exception:
+                         # File doesn't exist, proceed with upload
+                         pass
+
+                 # Read local file
+                 with open(local_path, 'rb') as f:
+                     file_data = f.read()
+
+                 # Upload to OneLake Files
+                 obs.put(store, remote_path, file_data)
+
+                 file_size = len(file_data)
+                 size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                 size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                 print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
+                 uploaded_count += 1
+
+             except Exception as e:
+                 print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
+                 failed_count += 1
+
+         print(f"\n{'='*60}")
+         if failed_count == 0:
+             print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
+         else:
+             print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
+         print(f"{'='*60}")
+
+         return failed_count == 0
+
+     def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
+                  file_extensions: Optional[List[str]] = None,
+                  overwrite: bool = False) -> bool:
+         """
+         Download files from OneLake Files section to a local folder.
+
+         Args:
+             remote_folder: Optional subfolder path in OneLake Files to download from
+             local_folder: Local folder path to download files to (default: "./downloaded_files")
+             file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+             overwrite: Whether to overwrite existing local files (default: False)
+
+         Returns:
+             True if all files downloaded successfully, False otherwise
+
+         Examples:
+             # Download all files from OneLake Files root
+             dr.download()
+
+             # Download only CSV files from a specific subfolder
+             dr.download("daily_reports", "./reports", ['.csv'])
+         """
+         # Get Azure token
+         token = self._get_storage_token()
+         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             print("Getting Azure token for file download...")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token_obj = credential.get_token("https://storage.azure.com/.default")
+             token = token_obj.token
+             os.environ["AZURE_STORAGE_TOKEN"] = token
+
+         # Setup OneLake Files URL (not Tables)
+         files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
+         store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+         # Create local directory
+         os.makedirs(local_folder, exist_ok=True)
+
+         # List files in OneLake Files
+         print(f"📁 Discovering files in OneLake Files...")
+         if remote_folder:
+             print(f" Source folder: {remote_folder}")
+             prefix = f"{remote_folder.strip('/')}/"
+         else:
+             prefix = ""
+
+         try:
+             list_stream = obs.list(store, prefix=prefix)
+             files_to_download = []
+
+             for batch in list_stream:
+                 for obj in batch:
+                     remote_path = obj["path"]
+
+                     # Filter by extensions if specified
+                     if file_extensions:
+                         _, ext = os.path.splitext(remote_path)
+                         if ext.lower() not in [e.lower() for e in file_extensions]:
+                             continue
+
+                     # Calculate local path
+                     if remote_folder:
+                         rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
+                     else:
+                         rel_path = remote_path
+
+                     local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
+                     files_to_download.append((remote_path, local_path))
+
+             if not files_to_download:
+                 print(f"No files found to download")
+                 if file_extensions:
+                     print(f" (filtered by extensions: {file_extensions})")
+                 return True
+
+             print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
+
+             downloaded_count = 0
+             failed_count = 0
+
+             for remote_path, local_path in files_to_download:
+                 try:
+                     # Check if local file exists (if not overwriting)
+                     if not overwrite and os.path.exists(local_path):
+                         print(f" ⏭ Skipped (exists): {local_path}")
+                         continue
+
+                     # Ensure local directory exists
+                     os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+                     # Download file
+                     data = obs.get(store, remote_path).bytes()
+
+                     # Write to local file
+                     with open(local_path, 'wb') as f:
+                         f.write(data)
+
+                     file_size = len(data)
+                     size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                     size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                     print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
+                     downloaded_count += 1
+
+                 except Exception as e:
+                     print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
+                     failed_count += 1
+
+             print(f"\n{'='*60}")
+             if failed_count == 0:
+                 print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
+             else:
+                 print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
+             print(f"{'='*60}")
+
+             return failed_count == 0
+
+         except Exception as e:
+             print(f"❌ Error listing files from OneLake: {e}")
+             return False
+
      def sql(self, query: str):
          """
          Execute raw SQL query with Spark-style write API.
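For orientation, here is a minimal usage sketch of the two methods added in this release, pieced together from the docstrings above and the README changes below; the workspace, lakehouse, and folder names are hypothetical placeholders.

```python
import duckrun

# Hypothetical workspace/lakehouse/schema path - substitute your own.
con = duckrun.connect("MyWorkspace/MyLakehouse.lakehouse/dbo")

# Upload every CSV under ./exports into the "exports" folder of OneLake Files.
# remote_folder is required; overwrite defaults to False, so existing files are skipped.
ok = con.copy("./exports", "exports", ['.csv'])

# Pull the same folder back down, replacing any stale local copies.
if ok:
    con.download("exports", "./exports_backup", ['.csv'], overwrite=True)
```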
duckrun-0.1.6.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.1.6.2
+ Version: 0.1.6.3
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
@@ -58,6 +58,10 @@ con.sql("SELECT * FROM my_table LIMIT 10").show()

  # Write to Delta tables (Spark-style API)
  con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
+
+ # Upload/download files to/from OneLake Files
+ con.copy("./local_folder", "target_folder")  # Upload files
+ con.download("target_folder", "./downloaded")  # Download files
  ```

  That's it! No `sql_folder` needed for data exploration.
@@ -127,7 +131,38 @@ con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")

  **Note:** `.format("delta")` is optional - Delta is the default format!

- ### 2. Pipeline Orchestration
+ ### 2. File Management (OneLake Files)
+
+ Upload and download files to/from OneLake Files section (not Delta tables):
+
+ ```python
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+ # Upload files to OneLake Files (remote_folder is required)
+ con.copy("./local_data", "uploaded_data")
+
+ # Upload only specific file types
+ con.copy("./reports", "daily_reports", ['.csv', '.parquet'])
+
+ # Upload with overwrite enabled (default is False for safety)
+ con.copy("./backup", "backups", overwrite=True)
+
+ # Download files from OneLake Files
+ con.download("uploaded_data", "./downloaded")
+
+ # Download only CSV files from a specific folder
+ con.download("daily_reports", "./reports", ['.csv'])
+ ```
+
+ **Key Features:**
+ - ✅ **Files go to OneLake Files section** (not Delta Tables)
+ - ✅ **`remote_folder` parameter is required** for uploads (prevents accidental uploads)
+ - ✅ **`overwrite=False` by default** (safer - prevents accidental overwrites)
+ - ✅ **File extension filtering** (e.g., only `.csv` or `.parquet` files)
+ - ✅ **Preserves folder structure** during upload/download
+ - ✅ **Progress reporting** with file sizes and upload status
+
+ ### 3. Pipeline Orchestration

  For production workflows with reusable SQL and Python tasks:

@@ -286,6 +321,63 @@ con = duckrun.connect(
  )
  ```

+ ## File Management API Reference
+
+ ### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`
+
+ Upload files from a local folder to OneLake Files section.
+
+ **Parameters:**
+ - `local_folder` (str): Path to local folder containing files to upload
+ - `remote_folder` (str): **Required** target folder path in OneLake Files
+ - `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
+ - `overwrite` (bool, optional): Whether to overwrite existing files (default: False)
+
+ **Returns:** `True` if all files uploaded successfully, `False` otherwise
+
+ **Examples:**
+ ```python
+ # Upload all files to a target folder
+ con.copy("./data", "processed_data")
+
+ # Upload only CSV and Parquet files
+ con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])
+
+ # Upload with overwrite enabled
+ con.copy("./backup", "daily_backup", overwrite=True)
+ ```
+
+ ### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`
+
+ Download files from OneLake Files section to a local folder.
+
+ **Parameters:**
+ - `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
+ - `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
+ - `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
+ - `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)
+
+ **Returns:** `True` if all files downloaded successfully, `False` otherwise
+
+ **Examples:**
+ ```python
+ # Download all files from OneLake Files root
+ con.download()
+
+ # Download from specific folder
+ con.download("processed_data", "./local_data")
+
+ # Download only JSON files
+ con.download("config", "./configs", ['.json'])
+ ```
+
+ **Important Notes:**
+ - Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
+ - The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
+ - Both methods default to `overwrite=False` for safety
+ - Folder structure is preserved during upload/download operations
+ - Progress is reported with file names, sizes, and upload/download status
+
  ## Complete Example

  ```python
@@ -294,7 +386,10 @@ import duckrun
  # Connect (specify schema for best performance)
  con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")

- # Pipeline with mixed tasks
+ # 1. Upload raw data files to OneLake Files
+ con.copy("./raw_data", "raw_uploads", ['.csv', '.json'])
+
+ # 2. Pipeline with mixed tasks
  pipeline = [
      # Download raw data (Python)
      ('fetch_api_data', ('https://api.example.com/sales', 'raw')),
@@ -309,20 +404,30 @@ pipeline = [
      ('sales_history', 'append')
  ]

- # Run
+ # Run pipeline
  success = con.run(pipeline)

- # Explore results
+ # 3. Explore results using DuckDB
  con.sql("SELECT * FROM regional_summary").show()

- # Export to new table
+ # 4. Export to new Delta table
  con.sql("""
      SELECT region, SUM(total) as grand_total
      FROM regional_summary
      GROUP BY region
  """).write.mode("overwrite").saveAsTable("region_totals")
+
+ # 5. Download processed files for external systems
+ con.download("processed_reports", "./exports", ['.csv'])
  ```

+ **This example demonstrates:**
+ - 📁 **File uploads** to OneLake Files section
+ - 🔄 **Pipeline orchestration** with SQL and Python tasks
+ - ⚡ **Fast data exploration** with DuckDB
+ - 💾 **Delta table creation** with Spark-style API
+ - 📤 **File downloads** from OneLake Files
+

  ## How It Works
  1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
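The "How It Works" note above mentions Azure authentication; the new `copy`/`download` code in core.py resolves a storage token through an azure-identity credential chain when one is not already cached. Below is a minimal standalone sketch of that pattern, using the same credential classes and scope URL as the diff above.

```python
from azure.identity import AzureCliCredential, ChainedTokenCredential, InteractiveBrowserCredential

# Try the Azure CLI login first; fall back to an interactive browser prompt.
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())

# OneLake is reached through the Azure Storage endpoint, so request a storage-scoped token.
access = credential.get_token("https://storage.azure.com/.default")
bearer = access.token  # the bearer string passed to AzureStore.from_url(..., bearer_token=...)
```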
duckrun-0.1.6.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+ duckrun/core.py,sha256=CT2NH5hCLsv4uB5zH3VxTuCVQy0nWkPBG-cICLPhG_8,34245
+ duckrun-0.1.6.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.1.6.3.dist-info/METADATA,sha256=ny5DcRSU1B4SdHdJqHCYk0-hNo9-zqFABqMY9ulAVNk,13595
+ duckrun-0.1.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.1.6.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.1.6.3.dist-info/RECORD,,
duckrun-0.1.6.2.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
- duckrun/core.py,sha256=_18GjaaT_CqhtivyDQuLIQx5UUuUIZNBMK9nBQgavXc,23180
- duckrun-0.1.6.2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
- duckrun-0.1.6.2.dist-info/METADATA,sha256=dYy1d8V2yq2JwqkLXwJC8iBLMP6UbbFm9ZGHsBJLGuY,9497
- duckrun-0.1.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- duckrun-0.1.6.2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
- duckrun-0.1.6.2.dist-info/RECORD,,