ingestr 0.12.2__py3-none-any.whl → 0.12.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,70 @@
+ """Google analytics source helpers"""
+
+ from typing import Iterator, List
+
+ import dlt
+ from apiclient.discovery import Resource  # type: ignore
+ from dlt.common import logger, pendulum
+ from dlt.common.typing import TDataItem
+ from google.analytics.data_v1beta.types import (
+     Dimension,
+     Metric,
+ )
+ from pendulum.datetime import DateTime
+
+ from .data_processing import get_report
+
+
+ def basic_report(
+     client: Resource,
+     rows_per_page: int,
+     dimensions: List[str],
+     metrics: List[str],
+     property_id: int,
+     resource_name: str,
+     start_date: str,
+     last_date: dlt.sources.incremental[DateTime],
+ ) -> Iterator[TDataItem]:
+     """
+     Retrieves the data for a report given dimensions, metrics, and filters required for the report.
+
+     Args:
+         client: The Google Analytics client used to make requests.
+         dimensions: Dimensions for the report. See metadata for the full list of dimensions.
+         metrics: Metrics for the report. See metadata for the full list of metrics.
+         property_id: A reference to the Google Analytics project.
+             More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+         rows_per_page: Controls how many rows are retrieved per page in the reports.
+             Default is 10000, maximum possible is 100000.
+         resource_name: The resource name used to save incremental into dlt state.
+         start_date: Incremental load start_date.
+             Default is taken from dlt state if it exists.
+         last_date: Incremental load end date.
+             Default is taken from dlt state if it exists.
+
+     Returns:
+         Generator of all rows of data in the report.
+     """
+
+     # grab the start time from last dlt load if not filled, if that is also empty then use the first day of the millennium as the start time instead
+     if last_date.last_value:
+         if start_date != "2015-08-14":
+             logger.warning(
+                 f"Using the starting date: {last_date.last_value} for incremental report: {resource_name} and ignoring start date passed as argument {start_date}"
+             )
+         start_date = last_date.last_value.to_date_string()
+     else:
+         start_date = start_date or "2015-08-14"
+
+     processed_response = get_report(
+         client=client,
+         property_id=property_id,
+         # fill dimensions and metrics with the proper api client objects
+         dimension_list=[Dimension(name=dimension) for dimension in dimensions],
+         metric_list=[Metric(name=metric) for metric in metrics],
+         limit=rows_per_page,
+         start_date=start_date,
+         # configure end_date to yesterday as a date string
+         end_date=pendulum.now().to_date_string(),
+     )
+     yield from processed_response
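
For context, a minimal sketch of how basic_report above could be bound as a dlt resource. This is a hypothetical wiring, not part of the release: the client construction, the dimension/metric names, the property id, and the pipeline settings are all illustrative.

import dlt
from google.analytics.data_v1beta import BetaAnalyticsDataClient

# hypothetical: relies on ambient Google credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS)
client = BetaAnalyticsDataClient()

report = dlt.resource(basic_report, name="basic_report", write_disposition="append")(
    client=client,
    rows_per_page=10000,
    dimensions=["date", "country"],
    metrics=["activeUsers"],
    property_id=123456,
    resource_name="basic_report",
    start_date="2015-08-14",
    # dlt keeps the cursor on the parsed "date" dimension between runs
    last_date=dlt.sources.incremental("date"),
)

pipeline = dlt.pipeline(pipeline_name="ga4_example", destination="duckdb", dataset_name="analytics")
pipeline.run(report)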
@@ -0,0 +1,176 @@
+ """
+ This module contains helpers that process data and make it ready for loading into the database
+ """
+
+ import json
+ from typing import Any, Iterator, List, Union
+
+ import proto
+ from dlt.common.exceptions import MissingDependencyException
+ from dlt.common.pendulum import pendulum
+ from dlt.common.typing import DictStrAny, TDataItem, TDataItems
+
+ try:
+     from google.analytics.data_v1beta import BetaAnalyticsDataClient  # noqa: F401
+     from google.analytics.data_v1beta.types import (
+         DateRange,
+         Dimension,
+         DimensionExpression,  # noqa: F401
+         DimensionMetadata,  # noqa: F401
+         GetMetadataRequest,  # noqa: F401
+         Metadata,  # noqa: F401
+         Metric,
+         MetricMetadata,  # noqa: F401
+         MetricType,
+         RunReportRequest,
+         RunReportResponse,
+     )
+ except ImportError:
+     raise MissingDependencyException(
+         "Google Analytics API Client", ["google-analytics-data"]
+     )
+ try:
+     from apiclient.discovery import Resource, build  # type: ignore # noqa: F401
+ except ImportError:
+     raise MissingDependencyException("Google API Client", ["google-api-python-client"])
+
+
+ def to_dict(item: Any) -> Iterator[TDataItem]:
+     """
+     Processes a batch result (page of results per dimension) accordingly
+     :param batch:
+     :return:
+     """
+     item = json.loads(
+         proto.Message.to_json(
+             item,
+             preserving_proto_field_name=True,
+             use_integers_for_enums=False,
+             including_default_value_fields=False,
+         )
+     )
+     yield item
+
+
+ def get_report(
+     client: Resource,
+     property_id: int,
+     dimension_list: List[Dimension],
+     metric_list: List[Metric],
+     limit: int,
+     start_date: str,
+     end_date: str,
+ ) -> Iterator[TDataItem]:
+     """
+     Gets all the possible pages of reports with the given query parameters.
+     Processes every page and yields a dictionary for every row of the report.
+
+     Args:
+         client: The Google Analytics client used to make requests.
+         property_id: A reference to the Google Analytics project.
+             More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+         dimension_list: A list of all the dimensions requested in the query.
+         metric_list: A list of all the metrics requested in the query.
+         limit: Describes how many rows there should be per page.
+         start_date: The starting date of the query.
+         end_date: The ending date of the query.
+
+     Yields:
+         Generator of all rows of data in the report.
+     """
+
+     request = RunReportRequest(
+         property=f"properties/{property_id}",
+         dimensions=dimension_list,
+         metrics=metric_list,
+         limit=limit,
+         date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
+     )
+     # process request
+     response = client.run_report(request)
+     processed_response_generator = process_report(response=response)
+     yield from processed_response_generator
+
+
+ def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
+     """
+     Receives a single page for a report response, processes it, and returns a generator for every row of data in the report page.
+
+     Args:
+         response: The API response for a single page of the report.
+
+     Yields:
+         Generator of dictionaries for every row of the report page.
+     """
+
+     metrics_headers = [header.name for header in response.metric_headers]
+     dimensions_headers = [header.name for header in response.dimension_headers]
+
+     distinct_key_combinations = {}
+
+     for row in response.rows:
+         response_dict: DictStrAny = {
+             dimension_header: _resolve_dimension_value(
+                 dimension_header, dimension_value.value
+             )
+             for dimension_header, dimension_value in zip(
+                 dimensions_headers, row.dimension_values
+             )
+         }
+
+         for i in range(len(metrics_headers)):
+             # get metric type and process the value depending on type. Save metric name including type as well for the columns
+             metric_type = response.metric_headers[i].type_
+             metric_value = process_metric_value(
+                 metric_type=metric_type, value=row.metric_values[i].value
+             )
+             response_dict[metrics_headers[i]] = metric_value
+
+         unique_key = "-".join(list(response_dict.keys()))
+         if unique_key not in distinct_key_combinations:
+             distinct_key_combinations[unique_key] = True
+
+         yield response_dict
+
+
+ def process_metric_value(metric_type: MetricType, value: str) -> Union[str, int, float]:
+     """
+     Processes the metric type, converts it from string to the correct type, and returns it.
+
+     Args:
+         metric_type: The type of the metric.
+         value: The value of the metric as a string.
+
+     Returns:
+         The given value converted to the correct data type.
+     """
+
+     # So far according to GA4 documentation these are the correct types: https://developers.google.com/analytics/devguides/reporting/data/v1/rest/v1beta/MetricType
+     # 0 for strings, 1 for ints and 2-12 are different types of floating points.
+     if metric_type.value == 0:
+         return value
+     elif metric_type.value == 1:
+         return int(value)
+     else:
+         return float(value)
+
+
+ def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
+     """
+     Helper function that receives a dimension's name and value and converts it to a datetime object if needed.
+
+     Args:
+         dimension_name: Name of the dimension.
+         dimension_value: Value of the dimension.
+
+     Returns:
+         The value of the dimension with the correct data type.
+     """
+     if dimension_name == "date":
+         return pendulum.from_format(dimension_value, "YYYYMMDD", tz="UTC")
+     elif dimension_name == "dateHour":
+         return pendulum.from_format(dimension_value, "YYYYMMDDHH", tz="UTC")
+     elif dimension_name == "dateHourMinute":
+         return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
+     else:
+         return dimension_value
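
The value coercion in this module is deterministic; a quick sketch of the expected conversions (values are illustrative, only the pendulum call is executed here):

import pendulum

# "dateHour" dimension values arrive as strings like "2024031115" and are parsed to UTC datetimes
parsed = pendulum.from_format("2024031115", "YYYYMMDDHH", tz="UTC")
print(parsed.isoformat())  # 2024-03-11T15:00:00+00:00

# metric values arrive as strings; per process_metric_value above, MetricType value 0 stays str,
# 1 becomes int, and 2-12 become float, e.g. process_metric_value(MetricType.TYPE_INTEGER, "42") -> 42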
ingestr/src/sources.py CHANGED
@@ -3,17 +3,44 @@ import csv
  import json
  import os
  import re
- from datetime import date
- from typing import Any, Callable, Optional
+ from datetime import date, datetime
+ from typing import (
+     Any,
+     Callable,
+     Dict,
+     Iterator,
+     List,
+     Literal,
+     Optional,
+     Union,
+ )
  from urllib.parse import ParseResult, parse_qs, quote, urlparse

  import dlt
  import pendulum
- from dlt.common.configuration.specs import AwsCredentials
+ import sqlalchemy
+ from dlt.common.configuration.specs import (
+     AwsCredentials,
+ )
+ from dlt.common.libs.sql_alchemy import (
+     Engine,
+     MetaData,
+ )
  from dlt.common.time import ensure_pendulum_datetime
- from dlt.common.typing import TSecretStrValue
- from dlt.sources.credentials import ConnectionStringCredentials
+ from dlt.common.typing import TDataItem, TSecretStrValue
+ from dlt.extract import Incremental
+ from dlt.sources.credentials import (
+     ConnectionStringCredentials,
+ )
  from dlt.sources.sql_database import sql_table
+ from dlt.sources.sql_database.helpers import TableLoader
+ from dlt.sources.sql_database.schema_types import (
+     ReflectionLevel,
+     SelectAny,
+     Table,
+     TTypeAdapter,
+ )
+ from sqlalchemy import Column
  from sqlalchemy import types as sa
  from sqlalchemy.dialects import mysql

@@ -28,6 +55,8 @@ from ingestr.src.dynamodb import dynamodb
  from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
  from ingestr.src.filesystem import readers
  from ingestr.src.filters import table_adapter_exclude_columns
+ from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
+ from ingestr.src.google_analytics import google_analytics
  from ingestr.src.google_sheets import google_spreadsheet
  from ingestr.src.gorgias import gorgias_source
  from ingestr.src.hubspot import hubspot
@@ -39,7 +68,7 @@ from ingestr.src.notion import notion_databases
  from ingestr.src.shopify import shopify_source
  from ingestr.src.slack import slack_source
  from ingestr.src.stripe_analytics import stripe_source
- from ingestr.src.table_definition import table_string_to_dataclass
+ from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
  from ingestr.src.tiktok_ads import tiktok_source
  from ingestr.src.time import isotime
  from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
@@ -48,6 +77,9 @@ from ingestr.src.zendesk.helpers.credentials import (
      ZendeskCredentialsToken,
  )

+ TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
+ TQueryAdapter = Callable[[SelectAny, Table], SelectAny]
+

  class SqlSource:
      table_builder: Callable
@@ -59,16 +91,16 @@ class SqlSource:
          return False

      def dlt_source(self, uri: str, table: str, **kwargs):
-         table_fields = table_string_to_dataclass(table)
+         table_fields = TableDefinition(dataset="custom", table="custom")
+         if not table.startswith("query:"):
+             table_fields = table_string_to_dataclass(table)

          incremental = None
          if kwargs.get("incremental_key"):
              start_value = kwargs.get("interval_start")
              end_value = kwargs.get("interval_end")
-
              incremental = dlt.sources.incremental(
                  kwargs.get("incremental_key", ""),
-                 # primary_key=(),
                  initial_value=start_value,
                  end_value=end_value,
              )
@@ -87,6 +119,111 @@
                  query = query.order_by(kwargs.get("incremental_key"))
              return query

+         defer_table_reflect = False
+         sql_backend = kwargs.get("sql_backend", "sqlalchemy")
+         if table.startswith("query:"):
+             if kwargs.get("sql_limit"):
+                 raise ValueError(
+                     "sql_limit is not supported for custom queries, please apply the limit in the query instead"
+                 )
+
+             sql_backend = "sqlalchemy"
+             defer_table_reflect = True
+             query_value = table.split(":", 1)[1]
+
+             # this is a very hacky version of the table_rows function. it is built this way to go around the dlt's table loader.
+             # I didn't want to write a full fledged sqlalchemy source for now, and wanted to benefit from the existing stuff to begin with.
+             # this is by no means a production ready solution, but it works for now.
+             # the core idea behind this implementation is to create a mock table instance with the columns that are absolutely necessary for the incremental load to work.
+             # the table loader will then use the query adapter callback to apply the actual query and load the rows.
+             def table_rows(
+                 engine: Engine,
+                 table: Union[Table, str],
+                 metadata: MetaData,
+                 chunk_size: int,
+                 backend: TableBackend,
+                 incremental: Optional[Incremental[Any]] = None,
+                 table_adapter_callback: Callable[[Table], None] = None,  # type: ignore
+                 reflection_level: ReflectionLevel = "minimal",
+                 backend_kwargs: Dict[str, Any] = None,  # type: ignore
+                 type_adapter_callback: Optional[TTypeAdapter] = None,
+                 included_columns: Optional[List[str]] = None,
+                 query_adapter_callback: Optional[TQueryAdapter] = None,
+                 resolve_foreign_keys: bool = False,
+             ) -> Iterator[TDataItem]:
+                 hints = {  # type: ignore
+                     "columns": [],
+                 }
+                 cols = []  # type: ignore
+
+                 if incremental:
+                     switchDict = {
+                         int: sa.INTEGER,
+                         datetime: sa.TIMESTAMP,
+                         date: sa.DATE,
+                         pendulum.Date: sa.DATE,
+                         pendulum.DateTime: sa.TIMESTAMP,
+                     }
+
+                     if incremental.last_value is not None:
+                         cols.append(
+                             Column(
+                                 incremental.cursor_path,
+                                 switchDict[type(incremental.last_value)],  # type: ignore
+                             )
+                         )
+                     else:
+                         cols.append(Column(incremental.cursor_path, sa.TIMESTAMP))  # type: ignore
+
+                 table = Table(
+                     "query_result",
+                     metadata,
+                     *cols,
+                 )
+
+                 loader = TableLoader(
+                     engine,
+                     backend,
+                     table,
+                     hints["columns"],  # type: ignore
+                     incremental=incremental,
+                     chunk_size=chunk_size,
+                     query_adapter_callback=query_adapter_callback,
+                 )
+                 try:
+                     yield from loader.load_rows(backend_kwargs)
+                 finally:
+                     if getattr(engine, "may_dispose_after_use", False):
+                         engine.dispose()
+
+             dlt.sources.sql_database.table_rows = table_rows
+
+             def query_adapter_callback(query, table, incremental=None, engine=None):
+                 params = {}
+                 if incremental:
+                     params["interval_start"] = (
+                         incremental.last_value
+                         if incremental.last_value is not None
+                         else datetime(year=1, month=1, day=1)
+                     )
+                     if incremental.end_value is not None:
+                         params["interval_end"] = incremental.end_value
+                 else:
+                     if ":interval_start" in query_value:
+                         params["interval_start"] = (
+                             datetime.min
+                             if kwargs.get("interval_start") is None
+                             else kwargs.get("interval_start")
+                         )
+                     if ":interval_end" in query_value:
+                         params["interval_end"] = (
+                             datetime.max
+                             if kwargs.get("interval_end") is None
+                             else kwargs.get("interval_end")
+                         )
+
+                 return sqlalchemy.text(query_value).bindparams(**params)
+
          def type_adapter_callback(sql_type):
              if isinstance(sql_type, mysql.SET):
                  return sa.JSON
@@ -97,7 +234,7 @@ class SqlSource:
              schema=table_fields.dataset,
              table=table_fields.table,
              incremental=incremental,
-             backend=kwargs.get("sql_backend", "sqlalchemy"),
+             backend=sql_backend,
              chunk_size=kwargs.get("page_size", None),
              reflection_level=reflection_level,
              query_adapter_callback=query_adapter_callback,
@@ -105,6 +242,7 @@ class SqlSource:
              table_adapter_callback=table_adapter_exclude_columns(
                  kwargs.get("sql_exclude_columns", [])
              ),
+             defer_table_reflect=defer_table_reflect,
          )

          return builder_res
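
The query_adapter_callback added above leans on SQLAlchemy's textual SQL with named bound parameters. A minimal sketch of that mechanism, separate from the release (the table and column names are made up):

from datetime import datetime
import sqlalchemy

# a user-supplied query, as it would arrive via the "query:SELECT ..." table syntax
query_value = "SELECT id, updated_at FROM events WHERE updated_at > :interval_start"

# the callback binds the incremental cursor (or a sentinel value) to the named placeholder;
# the patched table_rows then hands the statement to dlt's TableLoader for execution
stmt = sqlalchemy.text(query_value).bindparams(interval_start=datetime(2024, 1, 1))
print(stmt)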
@@ -1015,29 +1153,28 @@ class TikTokSource:
          if not access_token:
              raise ValueError("access_token is required to connect to TikTok")

-         time_zone = source_fields.get("time_zone", "UTC")
+         timezone = "UTC"
+         if source_fields.get("timezone") is not None:
+             timezone = source_fields.get("timezone")[0]  # type: ignore

-         advertiser_id = source_fields.get("advertiser_id")
-         if not advertiser_id:
-             raise ValueError("advertiser_id is required to connect to TikTok")
+         advertiser_ids = source_fields.get("advertiser_ids")
+         if not advertiser_ids:
+             raise ValueError("advertiser_ids is required to connect to TikTok")

-         start_date = pendulum.now().subtract(days=90).in_tz(time_zone[0])
-         end_date = ensure_pendulum_datetime(pendulum.now()).in_tz(time_zone[0])
+         advertiser_ids = advertiser_ids[0].replace(" ", "").split(",")
+
+         start_date = pendulum.now().subtract(days=30).in_tz(timezone)
+         end_date = ensure_pendulum_datetime(pendulum.now()).in_tz(timezone)

          interval_start = kwargs.get("interval_start")
          if interval_start is not None:
-             start_date = ensure_pendulum_datetime(interval_start).in_tz(time_zone[0])
+             start_date = ensure_pendulum_datetime(interval_start).in_tz(timezone)

          interval_end = kwargs.get("interval_end")
          if interval_end is not None:
-             end_date = ensure_pendulum_datetime(interval_end).in_tz(time_zone[0])
+             end_date = ensure_pendulum_datetime(interval_end).in_tz(timezone)

-         page_size = kwargs.get("page_size")
-         if page_size is not None and not isinstance(page_size, int):
-             page_size = int(page_size)
-
-         if page_size > 1000:
-             page_size = 1000
+         page_size = min(1000, kwargs.get("page_size", 1000))

          if table.startswith("custom:"):
              fields = table.split(":", 3)
@@ -1049,28 +1186,61 @@
              dimensions = fields[1].replace(" ", "").split(",")
              if (
                  "campaign_id" not in dimensions
-                 and "advertiser_id" not in dimensions
                  and "adgroup_id" not in dimensions
                  and "ad_id" not in dimensions
              ):
                  raise ValueError(
-                     "You must provide one ID dimension. Please use one ID dimension from the following options: [campaign_id, advertiser_id, adgroup_id, ad_id]"
+                     "TikTok API requires at least one ID dimension, please use one of the following dimensions: [campaign_id, adgroup_id, ad_id]"
                  )

+             if "advertiser_id" in dimensions:
+                 dimensions.remove("advertiser_id")
+
              metrics = fields[2].replace(" ", "").split(",")
-             filters = []
+             filtering_param = False
+             filter_name = ""
+             filter_value = []
              if len(fields) == 4:
-                 filters = fields[3].replace(" ", "").split(",")
+
+                 def parse_filters(filters_raw: str) -> dict:
+                     # Parse filter string like "key1=value1,key2=value2,value3,value4"
+                     filters = {}
+                     current_key = None
+
+                     for item in filters_raw.split(","):
+                         if "=" in item:
+                             # Start of a new key-value pair
+                             key, value = item.split("=")
+                             filters[key] = [value]  # Always start with a list
+                             current_key = key
+                         elif current_key is not None:
+                             # Additional value for the current key
+                             filters[current_key].append(item)
+
+                     # Convert single-item lists to simple values
+                     return {k: v[0] if len(v) == 1 else v for k, v in filters.items()}
+
+                 filtering_param = True
+                 filters = parse_filters(fields[3])
+                 if len(filters) > 1:
+                     raise ValueError(
+                         "Only one filter is allowed for TikTok custom reports"
+                     )
+                 filter_name = list(filters.keys())[0]
+                 filter_value = list(map(int, filters[list(filters.keys())[0]]))
+
          return tiktok_source(
              start_date=start_date,
              end_date=end_date,
              access_token=access_token[0],
-             advertiser_id=advertiser_id[0],
-             time_zone=time_zone[0],
+             advertiser_ids=advertiser_ids,
+             timezone=timezone,
              dimensions=dimensions,
              metrics=metrics,
-             filters=filters,
              page_size=page_size,
+             filter_name=filter_name,
+             filter_value=filter_value,
+             filtering_param=filtering_param,
          ).with_resources(endpoint)


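For reference, a standalone sketch of what the parse_filters helper above produces; the function below re-states the same grouping rule for illustration, and the filter values are made up:

def parse_filters_sketch(filters_raw: str) -> dict:
    # values without "=" attach to the most recently seen key, single values collapse to a scalar
    filters: dict = {}
    current_key = None
    for item in filters_raw.split(","):
        if "=" in item:
            key, value = item.split("=")
            filters[key] = [value]
            current_key = key
        elif current_key is not None:
            filters[current_key].append(item)
    return {k: v[0] if len(v) == 1 else v for k, v in filters.items()}

print(parse_filters_sketch("campaign_ids=111,222,333"))  # {'campaign_ids': ['111', '222', '333']}
print(parse_filters_sketch("campaign_id=111"))           # {'campaign_id': '111'}
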
@@ -1171,3 +1341,103 @@ class DynamoDBSource:
          )

          return dynamodb(table, creds, incremental)
+
+
+ class GoogleAnalyticsSource:
+     def handles_incrementality(self) -> bool:
+         return True
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         parse_uri = urlparse(uri)
+         source_fields = parse_qs(parse_uri.query)
+         cred_path = source_fields.get("credentials_path")
+
+         if not cred_path:
+             raise ValueError("credentials_path is required to connect Google Analytics")
+         credentials = {}
+
+         with open(cred_path[0], "r") as f:
+             credentials = json.load(f)
+
+         property_id = source_fields.get("property_id")
+         if not property_id:
+             raise ValueError("property_id is required to connect to Google Analytics")
+
+         interval_start = kwargs.get("interval_start")
+         start_date = (
+             interval_start.strftime("%Y-%m-%d") if interval_start else "2015-08-14"
+         )
+
+         fields = table.split(":")
+         if len(fields) != 3:
+             raise ValueError(
+                 "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
+             )
+
+         dimensions = fields[1].replace(" ", "").split(",")
+
+         datetime = ""
+         for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
+             if dimension_datetime in dimensions:
+                 datetime = dimension_datetime
+                 break
+         else:
+             raise ValueError(
+                 "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
+             )
+
+         metrics = fields[2].replace(" ", "").split(",")
+         queries = [
+             {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
+         ]
+
+         return google_analytics(
+             property_id=property_id[0],
+             start_date=start_date,
+             datetime=datetime,
+             queries=queries,
+             credentials=credentials,
+         ).with_resources("basic_report")
+
+
+ class GitHubSource:
+     def handles_incrementality(self) -> bool:
+         return True
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         if kwargs.get("incremental_key"):
+             raise ValueError(
+                 "Github takes care of incrementality on its own, you should not provide incremental_key"
+             )
+         # github://?access_token=<access_token>&owner=<owner>&repo=<repo>
+         parsed_uri = urlparse(uri)
+         source_fields = parse_qs(parsed_uri.query)
+
+         owner = source_fields.get("owner", [None])[0]
+         if not owner:
+             raise ValueError(
+                 "owner of the repository is required to connect with GitHub"
+             )
+
+         repo = source_fields.get("repo", [None])[0]
+         if not repo:
+             raise ValueError(
+                 "repo variable is required to retrieve data for a specific repository from GitHub."
+             )
+
+         access_token = source_fields.get("access_token", [None])[0]
+         if not access_token and table not in ["repo_events"]:
+             raise ValueError("access_token is required to connect with GitHub")
+
+         if table in ["issues", "pull_requests"]:
+             return github_reactions(
+                 owner=owner, name=repo, access_token=access_token
+             ).with_resources(table)
+         elif table == "repo_events":
+             return github_repo_events(owner=owner, name=repo, access_token=access_token)
+         elif table == "stargazers":
+             return github_stargazers(owner=owner, name=repo, access_token=access_token)
+         else:
+             raise ValueError(
+                 f"Resource '{table}' is not supported for GitHub source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+             )
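
A minimal sketch of how the two new source classes are expected to be invoked. The github:// query parameters are documented in the code above; the Google Analytics URI scheme, the credentials path, the property id, and the pipeline wiring are illustrative assumptions.

import dlt

from ingestr.src.sources import GitHubSource, GoogleAnalyticsSource

# GitHub: owner/repo come from the URI query string; repo_events works without a token
github_src = GitHubSource().dlt_source(
    uri="github://?owner=bruin-data&repo=ingestr",
    table="repo_events",
)

# Google Analytics: table format is custom:<dimensions>:<metrics>, and the dimensions
# must include one of date / dateHour / dateHourMinute
ga_src = GoogleAnalyticsSource().dlt_source(
    uri="googleanalytics://?credentials_path=/path/to/service_account.json&property_id=123456",
    table="custom:date,country:activeUsers",
)

pipeline = dlt.pipeline(pipeline_name="example", destination="duckdb", dataset_name="raw")
pipeline.run(github_src)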