pidatametrics1 0.3.5__py2.py3-none-any.whl → 0.3.7__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pidatametrics/exporter.py +66 -21
- pidatametrics/manager.py +39 -24
- {pidatametrics1-0.3.5.dist-info → pidatametrics1-0.3.7.dist-info}/METADATA +2 -2
- pidatametrics1-0.3.7.dist-info/RECORD +8 -0
- pidatametrics1-0.3.5.dist-info/RECORD +0 -8
- {pidatametrics1-0.3.5.dist-info → pidatametrics1-0.3.7.dist-info}/WHEEL +0 -0
pidatametrics/exporter.py
CHANGED
@@ -1,12 +1,15 @@
 import csv
-import
+import re
+import math
 from google.cloud import bigquery

-# Optional imports
+# Optional imports
 try:
     import pandas as pd
+    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
 except ImportError:
     pd = None
+    ILLEGAL_CHARACTERS_RE = None

 try:
     import gspread
@@ -15,6 +18,17 @@ except ImportError:
     gspread = None

 class PiExporter:
+
+    @staticmethod
+    def _clean_for_excel(text):
+        """Removes characters that cause Excel to crash/corrupt."""
+        if not isinstance(text, str):
+            return text
+        # Remove illegal control characters (null bytes, etc.)
+        if ILLEGAL_CHARACTERS_RE:
+            return ILLEGAL_CHARACTERS_RE.sub('', text)
+        return text
+
     @staticmethod
     def to_csv(data, filename):
         if not data:
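For context on the new helper: openpyxl raises IllegalCharacterError when asked to write control characters (null bytes, backspaces, escape codes) that often survive in scraped text, which is what surfaces in Excel as the "found a problem with some content" dialog. A minimal standalone sketch of what _clean_for_excel does (the dirty string is made up):

from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

dirty = "ACME\x00 Widgets\x08 (scraped title)"   # hypothetical scraped value
clean = ILLEGAL_CHARACTERS_RE.sub('', dirty)     # strip illegal control chars
print(clean)                                     # ACME Widgets (scraped title)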
@@ -33,9 +47,11 @@ class PiExporter:

     @staticmethod
     def to_excel(data, filename):
-        """
+        """
+        Exports to Excel with sanitization to prevent 'Corrupted File' errors.
+        """
         if pd is None:
-            print("Error: Pandas not installed.
+            print("Error: Pandas/Openpyxl not installed.")
             return
         if not data:
             print("No data to export.")
@@ -44,58 +60,87 @@ class PiExporter:
         if not filename.endswith('.xlsx'):
             filename += '.xlsx'

+        print(f"Preparing Excel file: {filename} ({len(data)} rows)...")
+
         try:
+            # 1. Create DataFrame
             df = pd.DataFrame(data)
-
+
+            # 2. Sanitize Data (Fixes "Excel found a problem with content")
+            # Apply cleaning to all string columns
+            for col in df.select_dtypes(include=['object']).columns:
+                df[col] = df[col].apply(PiExporter._clean_for_excel)
+
+            # 3. Export
+            df.to_excel(filename, index=False, engine='openpyxl')
             print(f"Successfully saved {len(data)} rows to {filename}")
+
         except Exception as e:
             print(f"Excel Export Failed: {e}")

     @staticmethod
     def to_google_sheet(data, spreadsheet_name, tab_name="Sheet1"):
-        """
-        Exports to Google Sheet using the Filename (not ID).
-        Uses the active Colab authentication.
-        """
         if gspread is None:
-            print("Error: gspread not installed.
+            print("Error: gspread not installed.")
             return
         if not data:
             print("No data to upload.")
             return

-
+        row_count = len(data)
+        print(f"Preparing Google Sheet upload: {row_count} rows...")
+
+        # --- WARNING FOR LARGE DATASETS ---
+        if row_count > 50000:
+            print(f"⚠️ WARNING: You are uploading {row_count} rows.")
+            print("   Google Sheets may become slow. Uploading in chunks...")

         try:
-            # 1.
+            # 1. Auth
             creds, _ = google.auth.default()
             client = gspread.authorize(creds)

-            # 2. Open
+            # 2. Open Sheet
             try:
                 sh = client.open(spreadsheet_name)
             except gspread.SpreadsheetNotFound:
                 print(f"Sheet '{spreadsheet_name}' not found. Creating it...")
                 sh = client.create(spreadsheet_name)

-            # 3.
+            # 3. Setup Tab
             try:
                 worksheet = sh.worksheet(tab_name)
-                worksheet.clear()
+                worksheet.clear()
             except gspread.WorksheetNotFound:
-                worksheet = sh.add_worksheet(title=tab_name, rows=
+                worksheet = sh.add_worksheet(title=tab_name, rows=row_count+100, cols=20)

             # 4. Prepare Data
             headers = list(data[0].keys())
             rows = [[row.get(col, '') for col in headers] for row in data]
-
+
+            # 5. Upload Headers first
+            worksheet.update([headers], 'A1')
+
+            # 6. CHUNKED UPLOAD (To prevent timeouts on large data)
+            chunk_size = 5000  # Safe limit for gspread
+            total_chunks = math.ceil(len(rows) / chunk_size)
+
+            print(f"Starting upload in {total_chunks} chunks...")
+
+            for i in range(total_chunks):
+                start = i * chunk_size
+                end = start + chunk_size
+                chunk = rows[start:end]
+
+                # Append rows is safer for large datasets than update range
+                worksheet.append_rows(chunk, value_input_option='RAW')
+
+                print(f"  - Uploaded chunk {i+1}/{total_chunks} ({len(chunk)} rows)")

-
-            worksheet.update(all_values)
-            print(f"Successfully uploaded {len(data)} rows to '{spreadsheet_name}' (Tab: {tab_name})")
+            print(f"✅ Successfully uploaded {row_count} rows to '{spreadsheet_name}' (Tab: {tab_name})")

         except Exception as e:
-            print(f"Google Sheet Upload Failed: {e}")
+            print(f"❌ Google Sheet Upload Failed: {e}")

     @staticmethod
     def to_bigquery(data, project_id, dataset_id, table_id):
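The chunk arithmetic above is straightforward: with chunk_size = 5000, a 12,345-row payload becomes ceil(12345 / 5000) = 3 append_rows calls of 5000, 5000 and 2345 rows. A standalone sketch of the same slicing (worksheet call commented out, payload made up):

import math

rows = [[i, f"term_{i}"] for i in range(12345)]   # hypothetical payload
chunk_size = 5000
total_chunks = math.ceil(len(rows) / chunk_size)  # 3

for i in range(total_chunks):
    chunk = rows[i * chunk_size:(i + 1) * chunk_size]
    # worksheet.append_rows(chunk, value_input_option='RAW')  # as in to_google_sheet
    print(f"chunk {i + 1}/{total_chunks}: {len(chunk)} rows")  # 5000, 5000, 2345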
pidatametrics/manager.py
CHANGED
@@ -6,22 +6,21 @@ from dateutil.relativedelta import relativedelta

 class PiReportManager(PiDataMetrics):

-    # --- HELPER: Generate Unique
-    def
+    # --- HELPER: Generate Unique Name ---
+    def _generate_unique_name(self, base_name, workspace_ref=None):
         """
-        Creates a
+        Creates a unique name like: Hist_51780_0502_1430
         (Base_WorkspaceID_Date_Time)
         """
         now = datetime.datetime.now()
-        timestamp = now.strftime("%d%m_%H%M") # e.g., 0502_1430
+        timestamp = now.strftime("%d%m_%H%M")  # e.g., 0502_1430

         ws_part = f"_{workspace_ref}" if workspace_ref else ""

-        #
-        # Timestamp (9) + Base (approx 10) leaves ~10 for ID.
+        # Combine parts
         full_name = f"{base_name}{ws_part}_{timestamp}"

-        # Truncate to 31 chars
+        # Truncate to 31 chars (Google Sheets limit) just in case
         return full_name[:31]

     def _resolve_workspaces(self, ids_str=None, name_pattern=None):
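A worked example of the naming scheme: with base "Hist", workspace 51780 and a run at 14:30 on 5 February, strftime("%d%m_%H%M") gives "0502_1430", so the result is Hist_51780_0502_1430 (20 characters, inside the 31-character cap). A reproducible sketch (the pinned datetime is illustrative):

import datetime

base_name, workspace_ref = "Hist", 51780
now = datetime.datetime(2025, 2, 5, 14, 30)   # pinned so the output is stable
timestamp = now.strftime("%d%m_%H%M")         # "0502_1430"
ws_part = f"_{workspace_ref}" if workspace_ref else ""
full_name = f"{base_name}{ws_part}_{timestamp}"
print(full_name[:31])                         # Hist_51780_0502_1430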
@@ -62,20 +61,34 @@ class PiReportManager(PiDataMetrics):
             current_date -= relativedelta(months=1)
         return dates

-
+    # --- UPDATED EXPORT LOGIC ---
+    def _export_data(self, data, output_mode, bq_config, spreadsheet_name, unique_name):
+        """
+        Handles export routing.
+        unique_name is used for:
+        - CSV Filename
+        - Excel Filename
+        - Google Sheet Tab Name
+        """
         if not data:
             print("No data to export.")
             return

         if output_mode == 'bigquery' and bq_config:
+            # BigQuery doesn't use filenames, it uses the Table ID in config
             PiExporter.to_bigquery(data, bq_config['project'], bq_config['dataset'], bq_config['table'])
+
         elif output_mode == 'excel':
-
+            # Use unique_name as filename
+            PiExporter.to_excel(data, unique_name)
+
         elif output_mode == 'gsheet' and spreadsheet_name:
-            #
-            PiExporter.to_google_sheet(data, spreadsheet_name, tab_name)
+            # Use unique_name as Tab Name
+            PiExporter.to_google_sheet(data, spreadsheet_name, tab_name=unique_name)
+
         else:
-
+            # Default to CSV, use unique_name as filename
+            PiExporter.to_csv(data, unique_name)

     def run_volume_report(self, filename, workspace_ids=None, workspace_name=None, output_mode='csv', bq_config=None, spreadsheet_name=None):
         targets = self._resolve_workspaces(workspace_ids, workspace_name)
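The effect of the routing is that every report runner just hands _export_data a mode string plus the generated unique_name. A hedged sketch of the resulting exporter calls (the import path and values are assumptions, not shown in this diff):

from pidatametrics.exporter import PiExporter  # assumed import path

data = [{"term": "running shoes", "rank": 3}]  # hypothetical rows
unique_name = "Vol_51780_0502_1430"

# output_mode='excel' -> to_excel appends .xlsx, writing Vol_51780_0502_1430.xlsx
PiExporter.to_excel(data, unique_name)

# output_mode='gsheet' -> the unique name becomes the tab, not the spreadsheet
PiExporter.to_google_sheet(data, "Pi Reports", tab_name=unique_name)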
@@ -89,11 +102,11 @@ class PiReportManager(PiDataMetrics):
                 rows = PiParsers.parse_volume_data(vol_data, stg['name'], terms, ws_name)
                 all_rows.extend(rows)

-        #
+        # Generate Unique Name
         ws_ref = list(targets.keys())[0] if targets else "Multi"
-
+        unique_name = self._generate_unique_name("Vol", ws_ref)

-        self._export_data(all_rows, output_mode,
+        self._export_data(all_rows, output_mode, bq_config, spreadsheet_name, unique_name)

     def run_serp_report(self, data_sources, output_mode='csv', bq_config=None, filename=None, manual_duplication=None, spreadsheet_name=None):
         yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
@@ -105,11 +118,11 @@ class PiReportManager(PiDataMetrics):
             rows = PiParsers.parse_serp_response(raw_data, market, w_name, se_name, yesterday, cat_map, manual_duplication)
             all_rows.extend(rows)

-        #
+        # Generate Unique Name
         ws_ref = data_sources[0][1] if data_sources else "All"
-
+        unique_name = self._generate_unique_name("SERP", ws_ref)

-        self._export_data(all_rows, output_mode,
+        self._export_data(all_rows, output_mode, bq_config, spreadsheet_name, unique_name)

     def run_historical_serp_report(self, data_sources, duration, frequency, start_date=None, features=None, num_results=25, output_mode='csv', bq_config=None, filename="historical_data", spreadsheet_name=None):
         if features is None:
@@ -141,19 +154,21 @@ class PiReportManager(PiDataMetrics):
             except Exception as e:
                 print(f"Failed to fetch {w_name} on {date}: {e}")

+            # BigQuery uploads immediately per day
             if output_mode == 'bigquery' and bq_config:
                 if daily_rows:
                     print(f"Uploading {len(daily_rows)} rows for {date} to BigQuery...")
                     PiExporter.to_bigquery(daily_rows, bq_config['project'], bq_config['dataset'], bq_config['table'])
+            # Others accumulate
             elif output_mode in ['csv', 'excel', 'gsheet']:
                 all_file_rows.extend(daily_rows)

+        # Final Export for Files
         if output_mode in ['csv', 'excel', 'gsheet']:
-            # Use the Workspace ID from the first data source
             ws_ref = data_sources[0][1] if data_sources else "All"
-
+            unique_name = self._generate_unique_name("Hist", ws_ref)

-            self._export_data(all_file_rows, output_mode,
+            self._export_data(all_file_rows, output_mode, bq_config, spreadsheet_name, unique_name)

     def run_llm_report(self, data_sources, start_period, end_period, stg_ids=None, output_mode='csv', bq_config=None, filename="llm_output", spreadsheet_name=None):
         all_rows = []
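Note the split in the historical runner: with output_mode='bigquery' each day's daily_rows are pushed as soon as they are parsed, while csv/excel/gsheet accumulate into all_file_rows for one final export. The expected bq_config shape is inferred from the keys the code reads (values here are hypothetical):

bq_config = {
    'project': 'my-gcp-project',    # hypothetical GCP project id
    'dataset': 'pi_datametrics',    # hypothetical dataset name
    'table': 'historical_serps',    # rows are appended here day by day
}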
@@ -170,8 +185,8 @@ class PiReportManager(PiDataMetrics):
             except Exception as e:
                 print(f"Failed to fetch LLM data for {w_name}: {e}")

-        #
+        # Generate Unique Name
         ws_ref = data_sources[0][1] if data_sources else "All"
-
+        unique_name = self._generate_unique_name("LLM", ws_ref)

-        self._export_data(all_rows, output_mode,
+        self._export_data(all_rows, output_mode, bq_config, spreadsheet_name, unique_name)
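Putting the manager changes together, a hedged end-to-end sketch (the constructor signature is not part of this diff, so the credentials argument and import path are assumptions):

from pidatametrics.manager import PiReportManager  # assumed import path

mgr = PiReportManager(api_key="...")  # hypothetical constructor

# gsheet mode: rows land in a tab named like Vol_<workspace>_<ddmm_HHMM>
mgr.run_volume_report(
    filename=None,              # the generated unique name now drives file/tab naming
    workspace_ids="51780",      # _resolve_workspaces takes an ids string
    output_mode='gsheet',
    spreadsheet_name="Pi Reports",
)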
{pidatametrics1-0.3.5.dist-info → pidatametrics1-0.3.7.dist-info}/METADATA
CHANGED

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: pidatametrics1
-Version: 0.3.5
-Summary: A wrapper for Pi Datametrics API with CSV and BigQuery support.
+Version: 0.3.7
+Summary: A test wrapper for Pi Datametrics API with CSV and BigQuery support.
 Requires-Dist: google-auth
 Requires-Dist: google-cloud-bigquery
 Requires-Dist: gspread
pidatametrics1-0.3.7.dist-info/RECORD
ADDED

@@ -0,0 +1,8 @@
+pidatametrics/__init__.py,sha256=cmNSHvjvMsYO1tMv0Nf-7LGjIJ8AFXmUIwiv8jQ34BI,137
+pidatametrics/client.py,sha256=tHH0GV0rk2SizVqRdKepjdDQevkfdWlHOJHwsPR2PCk,4399
+pidatametrics/exporter.py,sha256=yLN40kqwibHWs45gXKe_I1j9td9lJt-8LLFSS-Pk9-U,5813
+pidatametrics/manager.py,sha256=tUeeJ-wKAlhpWsaZEAjxtZCtA2EbQcTBB1JkXPEVV50,9101
+pidatametrics/parsers.py,sha256=fiLx3080wNubT1VqSIeDvlrKT85KdqlKhY6FaB2XuC8,5989
+pidatametrics1-0.3.7.dist-info/METADATA,sha256=3rqys7d0-SNpsghd25f3eS99WSl-1QYzTY0HjbAYNak,293
+pidatametrics1-0.3.7.dist-info/WHEEL,sha256=aha0VrrYvgDJ3Xxl3db_g_MDIW-ZexDdrc_m-Hk8YY4,105
+pidatametrics1-0.3.7.dist-info/RECORD,,
pidatametrics1-0.3.5.dist-info/RECORD
REMOVED

@@ -1,8 +0,0 @@
-pidatametrics/__init__.py,sha256=cmNSHvjvMsYO1tMv0Nf-7LGjIJ8AFXmUIwiv8jQ34BI,137
-pidatametrics/client.py,sha256=tHH0GV0rk2SizVqRdKepjdDQevkfdWlHOJHwsPR2PCk,4399
-pidatametrics/exporter.py,sha256=CcsdVhxI6rXi0zlQaYzFEGX0GL3ZaNV94Pj5r_WrZc4,4226
-pidatametrics/manager.py,sha256=Sz4ecxwtY-lVjDjXsYO_rmLbK5_o9ZU0Fdv1MK50r40,8899
-pidatametrics/parsers.py,sha256=fiLx3080wNubT1VqSIeDvlrKT85KdqlKhY6FaB2XuC8,5989
-pidatametrics1-0.3.5.dist-info/METADATA,sha256=GhLbrT6GUpcUq5F2PIzW5ThcnR-ApKkCL1WNkCBACzM,288
-pidatametrics1-0.3.5.dist-info/WHEEL,sha256=aha0VrrYvgDJ3Xxl3db_g_MDIW-ZexDdrc_m-Hk8YY4,105
-pidatametrics1-0.3.5.dist-info/RECORD,,
{pidatametrics1-0.3.5.dist-info → pidatametrics1-0.3.7.dist-info}/WHEEL
File without changes