pidatametrics1 0.3.5__tar.gz → 0.3.7__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: pidatametrics1
- Version: 0.3.5
- Summary: A wrapper for Pi Datametrics API with CSV and BigQuery support.
+ Version: 0.3.7
+ Summary: A test wrapper for Pi Datametrics API with CSV and BigQuery support.
  Requires-Dist: google-auth
  Requires-Dist: google-cloud-bigquery
  Requires-Dist: gspread
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"

  [project]
  name = "pidatametrics1"
- version = "0.3.5"
- description = "A wrapper for Pi Datametrics API with CSV and BigQuery support."
+ version = "0.3.7"
+ description = "A test wrapper for Pi Datametrics API with CSV and BigQuery support."
  dependencies = [
      "requests",
      "google-cloud-bigquery",
@@ -1,12 +1,15 @@
  import csv
- import json
+ import re
+ import math
  from google.cloud import bigquery

- # Optional imports with error handling
+ # Optional imports
  try:
      import pandas as pd
+     from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
  except ImportError:
      pd = None
+     ILLEGAL_CHARACTERS_RE = None

  try:
      import gspread
@@ -15,6 +18,17 @@ except ImportError:
      gspread = None

  class PiExporter:
+
+     @staticmethod
+     def _clean_for_excel(text):
+         """Removes characters that cause Excel to crash/corrupt."""
+         if not isinstance(text, str):
+             return text
+         # Remove illegal control characters (null bytes, etc.)
+         if ILLEGAL_CHARACTERS_RE:
+             return ILLEGAL_CHARACTERS_RE.sub('', text)
+         return text
+
      @staticmethod
      def to_csv(data, filename):
          if not data:
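This hunk is the core of the 0.3.7 Excel fix: openpyxl's ILLEGAL_CHARACTERS_RE matches the control characters that the XLSX format forbids. Note that the newly imported `re` module is not actually used in this hunk, and that when openpyxl is missing, `_clean_for_excel` passes text through unchanged. A minimal standalone sketch of the same sanitization with a hand-rolled fallback pattern (the names below are illustrative, not part of the package):

import re

# Assumption: this mirrors openpyxl's ILLEGAL_CHARACTERS_RE, which strips
# control characters below 0x20 except tab (\t), newline (\n) and CR (\r).
FALLBACK_ILLEGAL_RE = re.compile(r"[\000-\010\013\014\016-\037]")

def clean_for_excel(text):
    """Strip control characters that make Excel report a corrupted file."""
    if not isinstance(text, str):
        return text
    return FALLBACK_ILLEGAL_RE.sub("", text)

print(clean_for_excel("ok\x00bad\x01"))   # -> "okbad" (null bytes removed)
print(clean_for_excel("line1\nline2"))    # newlines survive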
@@ -33,9 +47,11 @@ class PiExporter:

      @staticmethod
      def to_excel(data, filename):
-         """Exports data to Excel. Requires pandas and openpyxl."""
+         """
+         Exports to Excel with sanitization to prevent 'Corrupted File' errors.
+         """
          if pd is None:
-             print("Error: Pandas not installed. Run: pip install pandas openpyxl")
+             print("Error: Pandas/Openpyxl not installed.")
              return
          if not data:
              print("No data to export.")
@@ -44,58 +60,87 @@ class PiExporter:
          if not filename.endswith('.xlsx'):
              filename += '.xlsx'

+         print(f"Preparing Excel file: {filename} ({len(data)} rows)...")
+
          try:
+             # 1. Create DataFrame
              df = pd.DataFrame(data)
-             df.to_excel(filename, index=False)
+
+             # 2. Sanitize Data (Fixes "Excel found a problem with content")
+             # Apply cleaning to all string columns
+             for col in df.select_dtypes(include=['object']).columns:
+                 df[col] = df[col].apply(PiExporter._clean_for_excel)
+
+             # 3. Export
+             df.to_excel(filename, index=False, engine='openpyxl')
              print(f"Successfully saved {len(data)} rows to {filename}")
+
          except Exception as e:
              print(f"Excel Export Failed: {e}")

      @staticmethod
      def to_google_sheet(data, spreadsheet_name, tab_name="Sheet1"):
-         """
-         Exports to Google Sheet using the Filename (not ID).
-         Uses the active Colab authentication.
-         """
          if gspread is None:
-             print("Error: gspread not installed. Run: pip install gspread google-auth")
+             print("Error: gspread not installed.")
              return
          if not data:
              print("No data to upload.")
              return

-         print(f"Connecting to Google Sheet: '{spreadsheet_name}'...")
+         row_count = len(data)
+         print(f"Preparing Google Sheet upload: {row_count} rows...")
+
+         # --- WARNING FOR LARGE DATASETS ---
+         if row_count > 50000:
+             print(f"⚠️ WARNING: You are uploading {row_count} rows.")
+             print("   Google Sheets may become slow. Uploading in chunks...")

          try:
-             # 1. Get Default Credentials (works with Colab auth.authenticate_user)
+             # 1. Auth
              creds, _ = google.auth.default()
              client = gspread.authorize(creds)

-             # 2. Open by Name (Title)
+             # 2. Open Sheet
              try:
                  sh = client.open(spreadsheet_name)
              except gspread.SpreadsheetNotFound:
                  print(f"Sheet '{spreadsheet_name}' not found. Creating it...")
                  sh = client.create(spreadsheet_name)

-             # 3. Select or Create Worksheet (Tab)
+             # 3. Setup Tab
              try:
                  worksheet = sh.worksheet(tab_name)
-                 worksheet.clear() # Clear old data
+                 worksheet.clear()
              except gspread.WorksheetNotFound:
-                 worksheet = sh.add_worksheet(title=tab_name, rows=len(data)+100, cols=20)
+                 worksheet = sh.add_worksheet(title=tab_name, rows=row_count+100, cols=20)

              # 4. Prepare Data
              headers = list(data[0].keys())
              rows = [[row.get(col, '') for col in headers] for row in data]
-             all_values = [headers] + rows
+
+             # 5. Upload Headers first
+             worksheet.update([headers], 'A1')
+
+             # 6. CHUNKED UPLOAD (To prevent timeouts on large data)
+             chunk_size = 5000 # Safe limit for gspread
+             total_chunks = math.ceil(len(rows) / chunk_size)
+
+             print(f"Starting upload in {total_chunks} chunks...")
+
+             for i in range(total_chunks):
+                 start = i * chunk_size
+                 end = start + chunk_size
+                 chunk = rows[start:end]
+
+                 # Append rows is safer for large datasets than update range
+                 worksheet.append_rows(chunk, value_input_option='RAW')
+
+                 print(f"  - Uploaded chunk {i+1}/{total_chunks} ({len(chunk)} rows)")

-             # 5. Update
-             worksheet.update(all_values)
-             print(f"Successfully uploaded {len(data)} rows to '{spreadsheet_name}' (Tab: {tab_name})")
+             print(f"✅ Successfully uploaded {row_count} rows to '{spreadsheet_name}' (Tab: {tab_name})")

          except Exception as e:
-             print(f"Google Sheet Upload Failed: {e}")
+             print(f"Google Sheet Upload Failed: {e}")

      @staticmethod
      def to_bigquery(data, project_id, dataset_id, table_id):
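The rewritten to_google_sheet trades the single worksheet.update(all_values) call for a header write plus append_rows in 5,000-row chunks, which keeps each request within the Sheets API's payload and timeout limits. Two details worth noting: google.auth must already be importable in this module (the import hunk above does not add it), and the update([headers], 'A1') argument order (values first) follows the gspread 6.x convention. A hedged usage sketch, assuming a Colab session where auth.authenticate_user() has already run; the import path, sheet name, and row data are made up:

from pidatametrics1.exporters import PiExporter   # import path assumed

rows = [
    {"term": "running shoes", "position": 3, "market": "UK"},
    {"term": "trail shoes",   "position": 7, "market": "UK"},
]

# Creates the spreadsheet if missing, clears or creates the tab, writes the
# header row to A1, then appends the data in 5,000-row chunks.
PiExporter.to_google_sheet(rows, "Pi Rankings", tab_name="SERP_51780_0502_1430")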
@@ -6,22 +6,21 @@ from dateutil.relativedelta import relativedelta

  class PiReportManager(PiDataMetrics):

-     # --- HELPER: Generate Unique Tab Name ---
-     def _generate_tab_name(self, base_name, workspace_ref=None):
+     # --- HELPER: Generate Unique Name ---
+     def _generate_unique_name(self, base_name, workspace_ref=None):
          """
-         Creates a tab name like: Volume_12345_0502_1430
+         Creates a unique name like: Hist_51780_0502_1430
          (Base_WorkspaceID_Date_Time)
          """
          now = datetime.datetime.now()
-         timestamp = now.strftime("%d%m_%H%M") # e.g., 0502_1430 (5th Feb, 14:30)
+         timestamp = now.strftime("%d%m_%H%M") # e.g., 0502_1430

          ws_part = f"_{workspace_ref}" if workspace_ref else ""

-         # Google Sheets tab limit is 31 chars.
-         # Timestamp (9) + Base (approx 10) leaves ~10 for ID.
+         # Combine parts
          full_name = f"{base_name}{ws_part}_{timestamp}"

-         # Truncate to 31 chars to avoid API errors
+         # Truncate to 31 chars (Google Sheets limit) just in case
          return full_name[:31]

      def _resolve_workspaces(self, ids_str=None, name_pattern=None):
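The renamed helper now feeds filenames as well as tab names, so the 31-character truncation (a Google Sheets tab-name limit) applies to every output mode. A worked example of the naming scheme (a standalone re-implementation for illustration, not the package's method):

import datetime

def generate_unique_name(base_name, workspace_ref=None):
    timestamp = datetime.datetime.now().strftime("%d%m_%H%M")
    ws_part = f"_{workspace_ref}" if workspace_ref else ""
    return f"{base_name}{ws_part}_{timestamp}"[:31]

# At 14:30 on 5 February this yields "Hist_51780_0502_1430":
# 20 characters, comfortably under the 31-character tab limit.
print(generate_unique_name("Hist", 51780))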
@@ -62,20 +61,34 @@ class PiReportManager(PiDataMetrics):
              current_date -= relativedelta(months=1)
          return dates

-     def _export_data(self, data, output_mode, filename, bq_config, spreadsheet_name, tab_name):
+     # --- UPDATED EXPORT LOGIC ---
+     def _export_data(self, data, output_mode, bq_config, spreadsheet_name, unique_name):
+         """
+         Handles export routing.
+         unique_name is used for:
+         - CSV Filename
+         - Excel Filename
+         - Google Sheet Tab Name
+         """
          if not data:
              print("No data to export.")
              return

          if output_mode == 'bigquery' and bq_config:
+             # BigQuery doesn't use filenames, it uses the Table ID in config
              PiExporter.to_bigquery(data, bq_config['project'], bq_config['dataset'], bq_config['table'])
+
          elif output_mode == 'excel':
-             PiExporter.to_excel(data, filename)
+             # Use unique_name as filename
+             PiExporter.to_excel(data, unique_name)
+
          elif output_mode == 'gsheet' and spreadsheet_name:
-             # tab_name is already generated with ID and Timestamp before calling this
-             PiExporter.to_google_sheet(data, spreadsheet_name, tab_name)
+             # Use unique_name as Tab Name
+             PiExporter.to_google_sheet(data, spreadsheet_name, tab_name=unique_name)
+
          else:
-             PiExporter.to_csv(data, filename)
+             # Default to CSV, use unique_name as filename
+             PiExporter.to_csv(data, unique_name)

      def run_volume_report(self, filename, workspace_ids=None, workspace_name=None, output_mode='csv', bq_config=None, spreadsheet_name=None):
          targets = self._resolve_workspaces(workspace_ids, workspace_name)
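With this change `_export_data` drops the `filename` parameter entirely: every file-based mode is named by the generated `unique_name`, and the `filename` arguments that the public run_* methods still accept no longer reach the exporters. A hedged sketch of how a caller exercises the new routing (the constructor signature and config values are assumptions):

manager = PiReportManager(api_key="...")   # constructor signature assumed

# 'bigquery' ignores filenames and writes to the configured table.
bq_config = {"project": "my-project", "dataset": "seo", "table": "volumes"}
manager.run_volume_report(None, workspace_ids="51780",
                          output_mode="bigquery", bq_config=bq_config)

# 'gsheet' uses the generated name ("Vol_51780_<ddmm_HHMM>") as the tab name.
manager.run_volume_report(None, workspace_ids="51780",
                          output_mode="gsheet", spreadsheet_name="Pi Rankings")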
@@ -89,11 +102,11 @@ class PiReportManager(PiDataMetrics):
              rows = PiParsers.parse_volume_data(vol_data, stg['name'], terms, ws_name)
              all_rows.extend(rows)

-         # Get the first Workspace ID found to use in the tab name
+         # Generate Unique Name
          ws_ref = list(targets.keys())[0] if targets else "Multi"
-         unique_tab = self._generate_tab_name("Vol", ws_ref)
+         unique_name = self._generate_unique_name("Vol", ws_ref)

-         self._export_data(all_rows, output_mode, filename, bq_config, spreadsheet_name, tab_name=unique_tab)
+         self._export_data(all_rows, output_mode, bq_config, spreadsheet_name, unique_name)

      def run_serp_report(self, data_sources, output_mode='csv', bq_config=None, filename=None, manual_duplication=None, spreadsheet_name=None):
          yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
@@ -105,11 +118,11 @@ class PiReportManager(PiDataMetrics):
              rows = PiParsers.parse_serp_response(raw_data, market, w_name, se_name, yesterday, cat_map, manual_duplication)
              all_rows.extend(rows)

-         # Use the Workspace ID from the first data source
+         # Generate Unique Name
          ws_ref = data_sources[0][1] if data_sources else "All"
-         unique_tab = self._generate_tab_name("SERP", ws_ref)
+         unique_name = self._generate_unique_name("SERP", ws_ref)

-         self._export_data(all_rows, output_mode, filename or "serp_output", bq_config, spreadsheet_name, tab_name=unique_tab)
+         self._export_data(all_rows, output_mode, bq_config, spreadsheet_name, unique_name)

      def run_historical_serp_report(self, data_sources, duration, frequency, start_date=None, features=None, num_results=25, output_mode='csv', bq_config=None, filename="historical_data", spreadsheet_name=None):
          if features is None:
@@ -141,19 +154,21 @@ class PiReportManager(PiDataMetrics):
              except Exception as e:
                  print(f"Failed to fetch {w_name} on {date}: {e}")

+             # BigQuery uploads immediately per day
              if output_mode == 'bigquery' and bq_config:
                  if daily_rows:
                      print(f"Uploading {len(daily_rows)} rows for {date} to BigQuery...")
                      PiExporter.to_bigquery(daily_rows, bq_config['project'], bq_config['dataset'], bq_config['table'])
+             # Others accumulate
              elif output_mode in ['csv', 'excel', 'gsheet']:
                  all_file_rows.extend(daily_rows)

+         # Final Export for Files
          if output_mode in ['csv', 'excel', 'gsheet']:
-             # Use the Workspace ID from the first data source
              ws_ref = data_sources[0][1] if data_sources else "All"
-             unique_tab = self._generate_tab_name("Hist", ws_ref)
+             unique_name = self._generate_unique_name("Hist", ws_ref)

-             self._export_data(all_file_rows, output_mode, filename, bq_config, spreadsheet_name, tab_name=unique_tab)
+             self._export_data(all_file_rows, output_mode, bq_config, spreadsheet_name, unique_name)

      def run_llm_report(self, data_sources, start_period, end_period, stg_ids=None, output_mode='csv', bq_config=None, filename="llm_output", spreadsheet_name=None):
          all_rows = []
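The historical report keeps two delivery paths: BigQuery rows are flushed as soon as each day is fetched, so a long backfill never holds more than one day's rows in memory, while file and sheet outputs accumulate everything and export once at the end. The pattern in isolation (a generic sketch, not the package's code):

def process_dates(dates, fetch_day, output_mode, flush_to_bq):
    accumulated = []
    for date in dates:
        daily_rows = fetch_day(date)
        if output_mode == 'bigquery':
            if daily_rows:
                flush_to_bq(daily_rows)     # per-day upload: flat memory use
        else:
            accumulated.extend(daily_rows)  # files need the whole dataset
    return accumulated                      # exported once after the loop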
@@ -170,8 +185,8 @@ class PiReportManager(PiDataMetrics):
              except Exception as e:
                  print(f"Failed to fetch LLM data for {w_name}: {e}")

-         # Use the Workspace ID from the first data source
+         # Generate Unique Name
          ws_ref = data_sources[0][1] if data_sources else "All"
-         unique_tab = self._generate_tab_name("LLM", ws_ref)
+         unique_name = self._generate_unique_name("LLM", ws_ref)

-         self._export_data(all_rows, output_mode, filename, bq_config, spreadsheet_name, tab_name=unique_tab)
+         self._export_data(all_rows, output_mode, bq_config, spreadsheet_name, unique_name)