pidatametrics-0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
+ Metadata-Version: 2.4
+ Name: pidatametrics
+ Version: 0.3.0
+ Summary: A wrapper for Pi Datametrics API with CSV and BigQuery support.
+ Requires-Dist: google-cloud-bigquery
+ Requires-Dist: requests
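
For reference, these metadata fields can be read back at runtime once the package is installed; a minimal sketch using the standard library (assumes `pip install pidatametrics` has already been run):

from importlib.metadata import metadata, requires

meta = metadata("pidatametrics")
print(meta["Name"], meta["Version"])   # pidatametrics 0.3.0
print(requires("pidatametrics"))       # e.g. ['google-cloud-bigquery', 'requests']
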
File without changes
@@ -0,0 +1,15 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "pidatametrics"
+ version = "0.3.0"
+ description = "A wrapper for Pi Datametrics API with CSV and BigQuery support."
+ dependencies = [
+     "requests",
+     "google-cloud-bigquery"
+ ]
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/pidatametrics"]
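
The hatchling configuration above implies a src/ layout; a sketch of the expected tree, inferred from the modules in this diff. Note that manager.py imports dateutil.relativedelta, so python-dateutil must be available at runtime even though it is not listed under [project].dependencies:

# Layout implied by packages = ["src/pidatametrics"] (inferred, not part of the package):
#
#   pyproject.toml
#   src/
#       pidatametrics/
#           __init__.py
#           client.py
#           parsers.py
#           exporter.py
#           manager.py
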
@@ -0,0 +1,4 @@
+ from .client import PiDataMetrics
+ from .parsers import PiParsers
+ from .exporter import PiExporter
+ from .manager import PiReportManager
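
Because __init__.py re-exports the four classes, callers can import them directly from the package root; a minimal sketch:

# Package-level imports exposed by __init__.py
from pidatametrics import PiDataMetrics, PiParsers, PiExporter, PiReportManager
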
@@ -0,0 +1,76 @@
+ import requests
+ from requests.auth import HTTPBasicAuth
+
+ class PiDataMetrics:
+     def __init__(self, client_id, client_secret, account_id=1377):
+         self.account_id = account_id  # Store account_id for global endpoints
+         self.auth_url = "https://app.pi-datametrics.com/api/auth"
+         self.base_url = f"https://app.pi-datametrics.com/api/accounts/{account_id}"
+         self.access_token = self._get_access_token(client_id, client_secret)
+         self.headers = {
+             'Authorization': f'Bearer {self.access_token}',
+             'Content-Type': 'application/json'
+         }
+
+     def _get_access_token(self, client_id, client_secret):
+         data = {"grant_type": "client_credentials"}
+         auth = HTTPBasicAuth(client_id, client_secret)
+         try:
+             response = requests.post(self.auth_url, data=data, auth=auth)
+             response.raise_for_status()
+             return response.json()['access_token']
+         except requests.exceptions.RequestException as e:
+             raise SystemExit(f"Authentication Failed: {e}")
+
+     # --- Generic Request Handler ---
+     def fetch_endpoint(self, endpoint_path, params=None):
+         """Generic method to fetch any endpoint relative to account base URL"""
+         url = f"{self.base_url}/{endpoint_path}"
+         response = requests.get(url, headers=self.headers, params=params)
+         response.raise_for_status()
+         return response.json().get('data', [])
+
+     # --- Specific Endpoint Wrappers ---
+     def get_workspaces(self):
+         return self.fetch_endpoint("workspaces")
+
+     def get_stgs(self, workspace_id):
+         return self.fetch_endpoint(f"workspaces/{workspace_id}/search-term-groups")
+
+     def get_search_terms(self, workspace_id, stg_id):
+         return self.fetch_endpoint(f"workspaces/{workspace_id}/search-term-groups/{stg_id}/search-terms")
+
+     def get_bulk_serp_data(self, workspace_id, search_engine_id, period, **kwargs):
+         params = {"search-engine-id": search_engine_id, "period": period}
+         params.update(kwargs)
+         return self.fetch_endpoint(f"workspaces/{workspace_id}/search-data/bulk-search-results", params=params)
+
+     def get_bulk_volume(self, workspace_id, start_date=None, end_date=None):
+         params = {}
+         if start_date and end_date:
+             params = {'start-period': start_date, 'end-period': end_date}
+         return self.fetch_endpoint(f"workspaces/{workspace_id}/volume-data/bulk-search-volume", params=params)
+
+     # --- NEW: LLM Mentions Endpoint ---
+     def get_llm_mentions(self, workspace_id, search_engine_id, start_period, end_period, stg_ids=None):
+         """
+         Fetches LLM citation data.
+         Note: This endpoint uses a different base path (/api/data/) than the standard account endpoints.
+         """
+         url = "https://app.pi-datametrics.com/api/data/llm/mentions"
+
+         params = {
+             "account-id": self.account_id,
+             "workspace-id": workspace_id,
+             "search-engine-id": search_engine_id,
+             "start-period": start_period,
+             "end-period": end_period
+         }
+
+         if stg_ids:
+             # API expects array of integers
+             params["search-term-group-id[]"] = stg_ids
+
+         response = requests.get(url, headers=self.headers, params=params)
+         response.raise_for_status()
+         return response.json().get('data', [])
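
A usage sketch for the client; the credentials, workspace index, search-engine ID, and dates below are placeholders, and instantiation performs a live token request against the auth endpoint:

from pidatametrics import PiDataMetrics

# Placeholder credentials -- constructing the client fetches an OAuth token immediately.
client = PiDataMetrics(client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET")

# Walk the account hierarchy: workspaces -> search term groups -> search terms.
workspaces = client.get_workspaces()
ws_id = workspaces[0]['id']
stgs = client.get_stgs(ws_id)
terms = client.get_search_terms(ws_id, stgs[0]['id'])

# Bulk SERP results for a single day (period as YYYY-MM-DD; search_engine_id is a placeholder).
serp = client.get_bulk_serp_data(ws_id, search_engine_id=1, period="2024-01-01")

# LLM mentions over a date range, optionally filtered to specific search term groups.
mentions = client.get_llm_mentions(ws_id, search_engine_id=1,
                                   start_period="2024-01-01", end_period="2024-01-31",
                                   stg_ids=[stgs[0]['id']])
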
@@ -0,0 +1,62 @@
+ import csv
+ from google.cloud import bigquery
+
+ class PiExporter:
+     @staticmethod
+     def to_csv(data, filename):
+         if not data:
+             print("No data to export.")
+             return
+
+         if not filename.endswith('.csv'):
+             filename += '.csv'
+
+         keys = data[0].keys()
+         with open(filename, 'w', newline='', encoding='utf-8') as f:
+             writer = csv.DictWriter(f, fieldnames=keys)
+             writer.writeheader()
+             writer.writerows(data)
+         print(f"Successfully saved {len(data)} rows to {filename}")
+
+     @staticmethod
+     def to_bigquery(data, project_id, dataset_id, table_id):
+         if not data:
+             print("No data to upload.")
+             return
+
+         client = bigquery.Client(project=project_id)
+         table_ref = f"{project_id}.{dataset_id}.{table_id}"
+
+         print(f"Uploading {len(data)} rows to BigQuery table {table_ref}...")
+
+         # Auto-detect schema is usually fine for JSON inserts,
+         # but explicit schema is safer. For generic use, we try auto-detect first.
+         job_config = bigquery.LoadJobConfig(
+             autodetect=True,
+             source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
+             write_disposition=bigquery.WriteDisposition.WRITE_APPEND
+         )
+
+         try:
+             # --- CHANGE STARTED HERE ---
+             # Old Code (Caused SSL Error):
+             # errors = client.insert_rows_json(table_ref, data)
+
+             # New Code (Uses Batch Load + your job_config):
+             job = client.load_table_from_json(
+                 data,
+                 table_ref,
+                 job_config=job_config
+             )
+
+             # Wait for the job to complete (this is required for batch loads)
+             job.result()
+
+             print(f"Upload successful. Loaded {job.output_rows} rows.")
+             # --- CHANGE ENDED HERE ---
+
+         except Exception as e:
+             print(f"BigQuery Upload Failed: {e}")
+             # Optional: Print detailed error list if available in exception
+             if hasattr(e, 'errors'):
+                 print(e.errors)
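
A short sketch of both export paths; the rows are a made-up sample and the BigQuery project/dataset/table names are placeholders (to_bigquery assumes Google Cloud credentials are available, e.g. via Application Default Credentials):

from pidatametrics import PiExporter

rows = [
    {"SearchTerm": "example query", "Position": 1, "URL": "https://example.com"},
    {"SearchTerm": "example query", "Position": 2, "URL": "https://example.org"},
]

# CSV: a ".csv" extension is appended automatically if missing.
PiExporter.to_csv(rows, "serp_sample")

# BigQuery: batch load with schema autodetection, appending to the table.
PiExporter.to_bigquery(rows, project_id="my-gcp-project", dataset_id="pi_data", table_id="serp_sample")
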
@@ -0,0 +1,159 @@
+ from .client import PiDataMetrics
+ from .parsers import PiParsers
+ from .exporter import PiExporter
+ import datetime
+ from dateutil.relativedelta import relativedelta
+
+ class PiReportManager(PiDataMetrics):
+     def _resolve_workspaces(self, ids_str=None, name_pattern=None):
+         all_ws = self.get_workspaces()
+         targets = {}
+         if ids_str and ids_str.strip():
+             target_ids = [int(x.strip()) for x in ids_str.split(',') if x.strip().isdigit()]
+             for ws in all_ws:
+                 if ws['id'] in target_ids:
+                     targets[ws['id']] = ws['name']
+         elif name_pattern:
+             for ws in all_ws:
+                 if ws.get('tracked') and name_pattern.lower() in ws['name'].lower():
+                     targets[ws['id']] = ws['name']
+         return targets
+
+     def _generate_historical_dates(self, start_date_str, duration, frequency):
+         dates = []
+         try:
+             current_date = datetime.datetime.strptime(start_date_str, "%Y-%m-%d")
+         except ValueError:
+             print(f"Error: Invalid date format {start_date_str}. Using yesterday.")
+             current_date = datetime.datetime.now() - datetime.timedelta(days=1)
+         if current_date > datetime.datetime.now():
+             print(f"WARNING: Start date {current_date.strftime('%Y-%m-%d')} is in the future!")
+         if frequency == 'weekly':
+             days_since_sunday = (current_date.weekday() + 1) % 7
+             if days_since_sunday > 0:
+                 current_date -= datetime.timedelta(days=days_since_sunday)
+                 print(f"Note: Adjusted start date to previous Sunday: {current_date.strftime('%Y-%m-%d')}")
+         for _ in range(int(duration)):
+             dates.append(current_date.strftime("%Y-%m-%d"))
+             if frequency == 'daily':
+                 current_date -= datetime.timedelta(days=1)
+             elif frequency == 'weekly':
+                 current_date -= datetime.timedelta(weeks=1)
+             elif frequency == 'monthly':
+                 current_date -= relativedelta(months=1)
+         return dates
+
+     def run_volume_report(self, filename, workspace_ids=None, workspace_name=None, output_mode='csv', bq_config=None):
+         targets = self._resolve_workspaces(workspace_ids, workspace_name)
+         if not targets: return
+         all_rows = []
+         for ws_id, ws_name in targets.items():
+             vol_data = self.get_bulk_volume(ws_id)
+             stgs = self.get_stgs(ws_id)
+             for stg in stgs:
+                 terms = self.get_search_terms(ws_id, stg['id'])
+                 rows = PiParsers.parse_volume_data(vol_data, stg['name'], terms, ws_name)
+                 all_rows.extend(rows)
+
+         if output_mode == 'bigquery' and bq_config:
+             PiExporter.to_bigquery(all_rows, bq_config['project'], bq_config['dataset'], bq_config['table'])
+         else:
+             PiExporter.to_csv(all_rows, filename)
+
+     def run_serp_report(self, data_sources, output_mode='csv', bq_config=None, filename=None, manual_duplication=None):
+         yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
+         all_rows = []
+         for source in data_sources:
+             market, w_id, w_name, se_id, se_name = source
+             raw_data = self.get_bulk_serp_data(w_id, se_id, yesterday)
+             cat_map = PiParsers.build_category_map(self, w_id)
+             rows = PiParsers.parse_serp_response(raw_data, market, w_name, se_name, yesterday, cat_map, manual_duplication)
+             all_rows.extend(rows)
+
+         if output_mode == 'bigquery' and bq_config:
+             PiExporter.to_bigquery(all_rows, bq_config['project'], bq_config['dataset'], bq_config['table'])
+         else:
+             PiExporter.to_csv(all_rows, filename or "serp_output.csv")
+
+     def run_historical_serp_report(self, data_sources, duration, frequency, start_date=None, features=None, num_results=25, output_mode='csv', bq_config=None, filename="historical_data.csv"):
+         if features is None:
+             features = ['classicLink', 'popularProducts']
+
+         if not start_date:
+             start_date = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
+
+         target_dates = self._generate_historical_dates(start_date, duration, frequency)
+
+         print(f"Starting Historical Report ({frequency}) for last {duration} periods...")
+
+         all_csv_rows = []  # Only used if output_mode is CSV
+
+         for i, date in enumerate(target_dates):
+             print(f"[{i+1}/{len(target_dates)}] Processing Date: {date}...")
+
+             daily_rows = []  # Reset container for this specific date
+
+             for source in data_sources:
+                 market, w_id, w_name, se_id, se_name = source
+                 try:
+                     params = {
+                         'serp-feature[]': features,
+                         'number-of-results': num_results
+                     }
+                     raw_data = self.get_bulk_serp_data(w_id, se_id, date, **params)
+
+                     # Parser handles filtering null titles
+                     rows = PiParsers.parse_serp_response(
+                         raw_data, market, w_name, se_name, date, category_map=None
+                     )
+
+                     daily_rows.extend(rows)
+
+                 except Exception as e:
+                     print(f"Failed to fetch {w_name} on {date}: {e}")
+
+             # --- UPLOAD LOGIC: PER DATE ---
+             if output_mode == 'bigquery' and bq_config:
+                 if daily_rows:
+                     print(f"Uploading {len(daily_rows)} rows for {date} to BigQuery...")
+                     PiExporter.to_bigquery(daily_rows, bq_config['project'], bq_config['dataset'], bq_config['table'])
+                 else:
+                     print(f"No data found for {date}, skipping upload.")
+
+             # --- CSV LOGIC: ACCUMULATE ---
+             elif output_mode == 'csv':
+                 all_csv_rows.extend(daily_rows)
+
+         # Final Save for CSV only
+         if output_mode == 'csv':
+             PiExporter.to_csv(all_csv_rows, filename)
+
+     # --- NEW: LLM Report ---
+     def run_llm_report(self, data_sources, start_period, end_period, stg_ids=None, output_mode='csv', bq_config=None, filename="llm_output.csv"):
+         """
+         Runs the LLM Mentions report for the specified data sources and date range.
+
+         :param data_sources: List of tuples (market, workspace_id, workspace_name, search_engine_id, search_engine_name)
+         :param start_period: YYYY-MM-DD or YYYY-WWW
+         :param end_period: YYYY-MM-DD or YYYY-WWW
+         :param stg_ids: Optional list of integers to filter by Search Term Group
+         """
+         all_rows = []
+         print(f"Starting LLM Report from {start_period} to {end_period}...")
+
+         for source in data_sources:
+             market, w_id, w_name, se_id, se_name = source
+             try:
+                 print(f"Fetching LLM data for {w_name} ({se_name})...")
+                 raw_data = self.get_llm_mentions(w_id, se_id, start_period, end_period, stg_ids)
+
+                 rows = PiParsers.parse_llm_response(raw_data, market, w_name, se_name)
+                 all_rows.extend(rows)
+                 print(f"Found {len(rows)} mentions/queries.")
+             except Exception as e:
+                 print(f"Failed to fetch LLM data for {w_name}: {e}")
+
+         if output_mode == 'bigquery' and bq_config:
+             PiExporter.to_bigquery(all_rows, bq_config['project'], bq_config['dataset'], bq_config['table'])
+         else:
+             PiExporter.to_csv(all_rows, filename)
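
A usage sketch for the report manager; the credentials, workspace/search-engine IDs, and BigQuery settings are placeholders, and each data_sources entry follows the (market, workspace_id, workspace_name, search_engine_id, search_engine_name) tuple shape documented in run_llm_report:

from pidatametrics import PiReportManager

manager = PiReportManager(client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET")

# (market, workspace_id, workspace_name, search_engine_id, search_engine_name) -- placeholder values.
sources = [("UK", 123, "UK Workspace", 1, "google.co.uk")]

# Yesterday's SERPs straight to CSV.
manager.run_serp_report(sources, output_mode='csv', filename="serp_output.csv")

# Twelve weekly snapshots, uploaded to BigQuery one date at a time.
bq = {"project": "my-gcp-project", "dataset": "pi_data", "table": "historical_serps"}
manager.run_historical_serp_report(sources, duration=12, frequency='weekly',
                                   output_mode='bigquery', bq_config=bq)

# LLM mentions for January, written to CSV.
manager.run_llm_report(sources, start_period="2024-01-01", end_period="2024-01-31",
                       filename="llm_output.csv")
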
@@ -0,0 +1,156 @@
+ import json
+
+ class PiParsers:
+     @staticmethod
+     def parse_serp_response(raw_data, market, workspace_name, search_engine_name, date, category_map=None, manual_duplication=None):
+         """
+         Transforms the nested 'Bulk Search Results' JSON into a flat list of dicts.
+         """
+         flat_rows = []
+
+         for item in raw_data:
+             search_term = item.get('searchTerm')
+
+             category = ""
+             if category_map:
+                 category = category_map.get(search_term, "")
+
+             results = item.get('results', [])
+             previous_pos = None
+
+             if results:
+                 for res in results:
+                     feature_type = res.get('feature')
+                     title = res.get('title')
+
+                     # If feature is popularProducts AND title is null/empty, SKIP this row.
+                     if feature_type == 'popularProducts' and not title:
+                         continue
+
+                     # --- Logic for Position Fill-Down ---
+                     pos = res.get('position')
+                     if pos is None:
+                         pos = previous_pos
+
+                     # --- Logic for Attributes (Popular Products) ---
+                     attributes = res.get('attributes', {})
+                     price = None
+                     site_name = None
+
+                     if isinstance(attributes, dict):
+                         price = attributes.get('price')
+                         site_name = attributes.get('site')
+                         attr_str = json.dumps(attributes)
+                     else:
+                         attr_str = None
+
+                     # Create Row
+                     row = {
+                         'Date': date,
+                         'Market': market,
+                         'SearchTerm': search_term,
+                         'URL': res.get('url'),
+                         'Position': pos,
+                         'SERPFeature': feature_type,
+                         'PageTitle': title,
+                         'Price': price,
+                         'SiteName': site_name,
+                         'SearchEngine': search_engine_name,
+                         'Attributes': attr_str,
+                         'Category': category,
+                         'Workspace': workspace_name
+                     }
+
+                     flat_rows.append(row)
+                     previous_pos = pos
+             else:
+                 pass
+
+         return flat_rows
+
+     @staticmethod
+     def parse_volume_data(volume_data, stg_name, stg_terms, workspace_name):
+         rows = []
+         volume_lookup = {item.get('search-term'): item for item in volume_data}
+
+         for term in stg_terms:
+             term_text = term if isinstance(term, str) else term.get('term', '')
+
+             if term_text in volume_lookup:
+                 item = volume_lookup[term_text]
+                 cpc = item.get('cpc', '')
+                 monthly_volume = item.get('monthly-volume', {})
+
+                 for month, vol in monthly_volume.items():
+                     rows.append({
+                         "Workspace": workspace_name,
+                         "STG": stg_name,
+                         "Search Term": term_text,
+                         "Month": month,
+                         "Search Volume": vol,
+                         "CPC": cpc
+                     })
+         return rows
+
+     @staticmethod
+     def build_category_map(pi_client, workspace_id):
+         mapping = {}
+         stgs = pi_client.get_stgs(workspace_id)
+         for stg in stgs:
+             terms = pi_client.get_search_terms(workspace_id, stg['id'])
+             for term in terms:
+                 t_text = term if isinstance(term, str) else term.get('term')
+                 mapping[t_text] = stg['name']
+         return mapping
+
+     # --- NEW: LLM Parser ---
+     @staticmethod
+     def parse_llm_response(raw_data, market, workspace_name, search_engine_name):
+         """
+         Flattens LLM mentions data.
+         Each mention becomes a row. If no mentions exist for a query, the query is still recorded.
+         """
+         rows = []
+         for item in raw_data:
+             period = item.get('period')
+             query = item.get('query')
+             response_text = item.get('response')
+             mentions = item.get('mentions', [])
+
+             base_row = {
+                 'Period': period,
+                 'Market': market,
+                 'Workspace': workspace_name,
+                 'SearchEngine': search_engine_name,
+                 'Query': query,
+                 'LLM_Response': response_text
+             }
+
+             if mentions:
+                 for m in mentions:
+                     row = base_row.copy()
+                     row.update({
+                         'Entity': m.get('entity'),
+                         'Citation': m.get('citation'),
+                         'Source_Site': m.get('source_site'),
+                         'Source_Domain': m.get('source_domain'),
+                         'Source_URL': m.get('source_url'),
+                         'Sentiment': m.get('sentiment'),
+                         'Sentiment_Category': m.get('sentiment_category')
+                     })
+                     rows.append(row)
+             else:
+                 # Record the query even if no citations found
+                 row = base_row.copy()
+                 row.update({
+                     'Entity': None,
+                     'Citation': None,
+                     'Source_Site': None,
+                     'Source_Domain': None,
+                     'Source_URL': None,
+                     'Sentiment': None,
+                     'Sentiment_Category': None
+                 })
+                 rows.append(row)
+
+         return rows
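
A small sketch of parse_serp_response's flattening, null-title filtering, and position fill-down, using a hand-made payload shaped like the fields the parser reads:

from pidatametrics import PiParsers

# Hand-made sample (not real API output) shaped like the fields the parser reads.
raw = [{
    "searchTerm": "running shoes",
    "results": [
        {"feature": "classicLink", "title": "Best Running Shoes", "position": 1, "url": "https://example.com"},
        # No position: inherits the previous result's position (fill-down).
        {"feature": "classicLink", "title": "Shoe Guide", "url": "https://example.org"},
        # popularProducts with a null title is skipped entirely.
        {"feature": "popularProducts", "title": None, "position": 2},
    ],
}]

rows = PiParsers.parse_serp_response(raw, market="UK", workspace_name="Demo",
                                     search_engine_name="google.co.uk", date="2024-01-01")
print(len(rows))            # 2 -- the null-title popularProducts row was dropped
print(rows[1]['Position'])  # 1 -- filled down from the previous result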