ingestr 0.12.2__py3-none-any.whl → 0.12.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- ingestr/main.py +51 -4
- ingestr/src/factory.py +4 -0
- ingestr/src/github/__init__.py +149 -0
- ingestr/src/github/helpers.py +193 -0
- ingestr/src/github/queries.py +115 -0
- ingestr/src/github/settings.py +10 -0
- ingestr/src/google_analytics/__init__.py +70 -0
- ingestr/src/google_analytics/helpers/__init__.py +70 -0
- ingestr/src/google_analytics/helpers/data_processing.py +176 -0
- ingestr/src/sources.py +301 -31
- ingestr/src/tiktok_ads/__init__.py +72 -39
- ingestr/src/tiktok_ads/tiktok_helpers.py +32 -13
- ingestr/src/version.py +1 -1
- {ingestr-0.12.2.dist-info → ingestr-0.12.4.dist-info}/METADATA +2 -1
- {ingestr-0.12.2.dist-info → ingestr-0.12.4.dist-info}/RECORD +18 -11
- {ingestr-0.12.2.dist-info → ingestr-0.12.4.dist-info}/WHEEL +0 -0
- {ingestr-0.12.2.dist-info → ingestr-0.12.4.dist-info}/entry_points.txt +0 -0
- {ingestr-0.12.2.dist-info → ingestr-0.12.4.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/google_analytics/helpers/__init__.py ADDED
@@ -0,0 +1,70 @@
+"""Google analytics source helpers"""
+
+from typing import Iterator, List
+
+import dlt
+from apiclient.discovery import Resource  # type: ignore
+from dlt.common import logger, pendulum
+from dlt.common.typing import TDataItem
+from google.analytics.data_v1beta.types import (
+    Dimension,
+    Metric,
+)
+from pendulum.datetime import DateTime
+
+from .data_processing import get_report
+
+
+def basic_report(
+    client: Resource,
+    rows_per_page: int,
+    dimensions: List[str],
+    metrics: List[str],
+    property_id: int,
+    resource_name: str,
+    start_date: str,
+    last_date: dlt.sources.incremental[DateTime],
+) -> Iterator[TDataItem]:
+    """
+    Retrieves the data for a report given the dimensions, metrics, and filters required for the report.
+
+    Args:
+        client: The Google Analytics client used to make requests.
+        dimensions: Dimensions for the report. See metadata for the full list of dimensions.
+        metrics: Metrics for the report. See metadata for the full list of metrics.
+        property_id: A reference to the Google Analytics project.
+            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+        rows_per_page: Controls how many rows are retrieved per page in the reports.
+            Default is 10000, maximum possible is 100000.
+        resource_name: The resource name used to save the incremental into the dlt state.
+        start_date: Incremental load start date.
+            Default is taken from the dlt state if it exists.
+        last_date: Incremental load end date.
+            Default is taken from the dlt state if it exists.
+
+    Returns:
+        Generator of all rows of data in the report.
+    """
+
+    # take the start time from the last dlt load if present; otherwise fall back to the
+    # passed argument or, failing that, the default "2015-08-14"
+    if last_date.last_value:
+        if start_date != "2015-08-14":
+            logger.warning(
+                f"Using the starting date: {last_date.last_value} for incremental report: {resource_name} and ignoring start date passed as argument {start_date}"
+            )
+        start_date = last_date.last_value.to_date_string()
+    else:
+        start_date = start_date or "2015-08-14"
+
+    processed_response = get_report(
+        client=client,
+        property_id=property_id,
+        # fill dimensions and metrics with the proper api client objects
+        dimension_list=[Dimension(name=dimension) for dimension in dimensions],
+        metric_list=[Metric(name=metric) for metric in metrics],
+        limit=rows_per_page,
+        start_date=start_date,
+        # use the current date as the end of the range, formatted as a date string
+        end_date=pendulum.now().to_date_string(),
+    )
+    yield from processed_response
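Note: the start-date precedence above (dlt state first, then the explicit argument, then the "2015-08-14" default) can be summarized in a standalone sketch; resolve_start_date is a hypothetical helper for illustration, not part of the diff:

def resolve_start_date(last_value, start_date: str) -> str:
    # dlt incremental state wins; then the caller's argument; then the default
    if last_value:
        return last_value.to_date_string()
    return start_date or "2015-08-14"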
ingestr/src/google_analytics/helpers/data_processing.py ADDED
@@ -0,0 +1,176 @@
+"""
+This module contains helpers that process data and make it ready for loading into the database
+"""
+
+import json
+from typing import Any, Iterator, List, Union
+
+import proto
+from dlt.common.exceptions import MissingDependencyException
+from dlt.common.pendulum import pendulum
+from dlt.common.typing import DictStrAny, TDataItem, TDataItems
+
+try:
+    from google.analytics.data_v1beta import BetaAnalyticsDataClient  # noqa: F401
+    from google.analytics.data_v1beta.types import (
+        DateRange,
+        Dimension,
+        DimensionExpression,  # noqa: F401
+        DimensionMetadata,  # noqa: F401
+        GetMetadataRequest,  # noqa: F401
+        Metadata,  # noqa: F401
+        Metric,
+        MetricMetadata,  # noqa: F401
+        MetricType,
+        RunReportRequest,
+        RunReportResponse,
+    )
+except ImportError:
+    raise MissingDependencyException(
+        "Google Analytics API Client", ["google-analytics-data"]
+    )
+try:
+    from apiclient.discovery import Resource, build  # type: ignore # noqa: F401
+except ImportError:
+    raise MissingDependencyException("Google API Client", ["google-api-python-client"])
+
+
+def to_dict(item: Any) -> Iterator[TDataItem]:
+    """
+    Converts a proto message (one page of results per dimension) into a dictionary and yields it.
+
+    Args:
+        item: The proto message to convert.
+
+    Yields:
+        The message rendered as a dictionary.
+    """
+    item = json.loads(
+        proto.Message.to_json(
+            item,
+            preserving_proto_field_name=True,
+            use_integers_for_enums=False,
+            including_default_value_fields=False,
+        )
+    )
+    yield item
+
+
+def get_report(
+    client: Resource,
+    property_id: int,
+    dimension_list: List[Dimension],
+    metric_list: List[Metric],
+    limit: int,
+    start_date: str,
+    end_date: str,
+) -> Iterator[TDataItem]:
+    """
+    Gets all the possible pages of reports with the given query parameters.
+    Processes every page and yields a dictionary for every row of the report.
+
+    Args:
+        client: The Google Analytics client used to make requests.
+        property_id: A reference to the Google Analytics project.
+            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+        dimension_list: A list of all the dimensions requested in the query.
+        metric_list: A list of all the metrics requested in the query.
+        limit: Describes how many rows there should be per page.
+        start_date: The starting date of the query.
+        end_date: The ending date of the query.
+
+    Yields:
+        Generator of all rows of data in the report.
+    """
+
+    request = RunReportRequest(
+        property=f"properties/{property_id}",
+        dimensions=dimension_list,
+        metrics=metric_list,
+        limit=limit,
+        date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
+    )
+    # process request
+    response = client.run_report(request)
+    processed_response_generator = process_report(response=response)
+    yield from processed_response_generator
+
+
+def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
+    """
+    Receives a single page of a report response, processes it, and returns a generator for every row of data in the report page.
+
+    Args:
+        response: The API response for a single page of the report.
+
+    Yields:
+        Generator of dictionaries for every row of the report page.
+    """
+
+    metrics_headers = [header.name for header in response.metric_headers]
+    dimensions_headers = [header.name for header in response.dimension_headers]
+
+    distinct_key_combinations = {}
+
+    for row in response.rows:
+        response_dict: DictStrAny = {
+            dimension_header: _resolve_dimension_value(
+                dimension_header, dimension_value.value
+            )
+            for dimension_header, dimension_value in zip(
+                dimensions_headers, row.dimension_values
+            )
+        }
+
+        for i in range(len(metrics_headers)):
+            # get the metric type and process the value depending on the type. Save the metric name including the type as well for the columns
+            metric_type = response.metric_headers[i].type_
+            metric_value = process_metric_value(
+                metric_type=metric_type, value=row.metric_values[i].value
+            )
+            response_dict[metrics_headers[i]] = metric_value
+
+        unique_key = "-".join(list(response_dict.keys()))
+        if unique_key not in distinct_key_combinations:
+            distinct_key_combinations[unique_key] = True
+
+        yield response_dict
+
+
+def process_metric_value(metric_type: MetricType, value: str) -> Union[str, int, float]:
+    """
+    Checks the metric type, converts the value from a string to the correct type, and returns it.
+
+    Args:
+        metric_type: The type of the metric.
+        value: The value of the metric as a string.
+
+    Returns:
+        The given value converted to the correct data type.
+    """
+
+    # So far according to the GA4 documentation these are the correct types: https://developers.google.com/analytics/devguides/reporting/data/v1/rest/v1beta/MetricType
+    # 0 is for strings, 1 is for ints, and 2-12 are different kinds of floating points.
+    if metric_type.value == 0:
+        return value
+    elif metric_type.value == 1:
+        return int(value)
+    else:
+        return float(value)
+
+
+def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
+    """
+    Helper function that receives a dimension's name and value and converts the value to a datetime object if needed.
+
+    Args:
+        dimension_name: Name of the dimension.
+        dimension_value: Value of the dimension.
+
+    Returns:
+        The value of the dimension with the correct data type.
+    """
+    if dimension_name == "date":
+        return pendulum.from_format(dimension_value, "YYYYMMDD", tz="UTC")
+    elif dimension_name == "dateHour":
+        return pendulum.from_format(dimension_value, "YYYYMMDDHH", tz="UTC")
+    elif dimension_name == "dateHourMinute":
+        return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
+    else:
+        return dimension_value
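Note: a minimal illustration of how process_metric_value maps GA4 metric types, assuming google-analytics-data is installed; the enum members come from the MetricType reference linked above:

from google.analytics.data_v1beta.types import MetricType

assert process_metric_value(MetricType.TYPE_INTEGER, "42") == 42               # enum value 1 -> int
assert process_metric_value(MetricType.TYPE_FLOAT, "3.5") == 3.5               # enum value 2 -> float
assert process_metric_value(MetricType.METRIC_TYPE_UNSPECIFIED, "n/a") == "n/a"  # enum value 0 -> str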
ingestr/src/sources.py CHANGED
@@ -3,17 +3,44 @@ import csv
 import json
 import os
 import re
-from datetime import date
-from typing import
+from datetime import date, datetime
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Union,
+)
 from urllib.parse import ParseResult, parse_qs, quote, urlparse

 import dlt
 import pendulum
-
+import sqlalchemy
+from dlt.common.configuration.specs import (
+    AwsCredentials,
+)
+from dlt.common.libs.sql_alchemy import (
+    Engine,
+    MetaData,
+)
 from dlt.common.time import ensure_pendulum_datetime
-from dlt.common.typing import TSecretStrValue
-from dlt.
+from dlt.common.typing import TDataItem, TSecretStrValue
+from dlt.extract import Incremental
+from dlt.sources.credentials import (
+    ConnectionStringCredentials,
+)
 from dlt.sources.sql_database import sql_table
+from dlt.sources.sql_database.helpers import TableLoader
+from dlt.sources.sql_database.schema_types import (
+    ReflectionLevel,
+    SelectAny,
+    Table,
+    TTypeAdapter,
+)
+from sqlalchemy import Column
 from sqlalchemy import types as sa
 from sqlalchemy.dialects import mysql
@@ -28,6 +55,8 @@ from ingestr.src.dynamodb import dynamodb
 from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
 from ingestr.src.filesystem import readers
 from ingestr.src.filters import table_adapter_exclude_columns
+from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
+from ingestr.src.google_analytics import google_analytics
 from ingestr.src.google_sheets import google_spreadsheet
 from ingestr.src.gorgias import gorgias_source
 from ingestr.src.hubspot import hubspot
@@ -39,7 +68,7 @@ from ingestr.src.notion import notion_databases
 from ingestr.src.shopify import shopify_source
 from ingestr.src.slack import slack_source
 from ingestr.src.stripe_analytics import stripe_source
-from ingestr.src.table_definition import table_string_to_dataclass
+from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
 from ingestr.src.tiktok_ads import tiktok_source
 from ingestr.src.time import isotime
 from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
@@ -48,6 +77,9 @@ from ingestr.src.zendesk.helpers.credentials import (
     ZendeskCredentialsToken,
 )

+TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
+TQueryAdapter = Callable[[SelectAny, Table], SelectAny]
+

 class SqlSource:
     table_builder: Callable
@@ -59,16 +91,16 @@ class SqlSource:
         return False

     def dlt_source(self, uri: str, table: str, **kwargs):
-        table_fields =
+        table_fields = TableDefinition(dataset="custom", table="custom")
+        if not table.startswith("query:"):
+            table_fields = table_string_to_dataclass(table)

         incremental = None
         if kwargs.get("incremental_key"):
             start_value = kwargs.get("interval_start")
             end_value = kwargs.get("interval_end")
-
             incremental = dlt.sources.incremental(
                 kwargs.get("incremental_key", ""),
-                # primary_key=(),
                 initial_value=start_value,
                 end_value=end_value,
             )
@@ -87,6 +119,111 @@ class SqlSource:
                 query = query.order_by(kwargs.get("incremental_key"))
             return query

+        defer_table_reflect = False
+        sql_backend = kwargs.get("sql_backend", "sqlalchemy")
+        if table.startswith("query:"):
+            if kwargs.get("sql_limit"):
+                raise ValueError(
+                    "sql_limit is not supported for custom queries, please apply the limit in the query instead"
+                )
+
+            sql_backend = "sqlalchemy"
+            defer_table_reflect = True
+            query_value = table.split(":", 1)[1]
+
+            # this is a very hacky version of the table_rows function. it is built this way to get around dlt's table loader.
+            # I didn't want to write a full-fledged sqlalchemy source for now, and wanted to benefit from the existing stuff to begin with.
+            # this is by no means a production-ready solution, but it works for now.
+            # the core idea behind this implementation is to create a mock table instance with the columns that are absolutely necessary for the incremental load to work.
+            # the table loader will then use the query adapter callback to apply the actual query and load the rows.
+            def table_rows(
+                engine: Engine,
+                table: Union[Table, str],
+                metadata: MetaData,
+                chunk_size: int,
+                backend: TableBackend,
+                incremental: Optional[Incremental[Any]] = None,
+                table_adapter_callback: Callable[[Table], None] = None,  # type: ignore
+                reflection_level: ReflectionLevel = "minimal",
+                backend_kwargs: Dict[str, Any] = None,  # type: ignore
+                type_adapter_callback: Optional[TTypeAdapter] = None,
+                included_columns: Optional[List[str]] = None,
+                query_adapter_callback: Optional[TQueryAdapter] = None,
+                resolve_foreign_keys: bool = False,
+            ) -> Iterator[TDataItem]:
+                hints = {  # type: ignore
+                    "columns": [],
+                }
+                cols = []  # type: ignore
+
+                if incremental:
+                    switchDict = {
+                        int: sa.INTEGER,
+                        datetime: sa.TIMESTAMP,
+                        date: sa.DATE,
+                        pendulum.Date: sa.DATE,
+                        pendulum.DateTime: sa.TIMESTAMP,
+                    }
+
+                    if incremental.last_value is not None:
+                        cols.append(
+                            Column(
+                                incremental.cursor_path,
+                                switchDict[type(incremental.last_value)],  # type: ignore
+                            )
+                        )
+                    else:
+                        cols.append(Column(incremental.cursor_path, sa.TIMESTAMP))  # type: ignore
+
+                table = Table(
+                    "query_result",
+                    metadata,
+                    *cols,
+                )
+
+                loader = TableLoader(
+                    engine,
+                    backend,
+                    table,
+                    hints["columns"],  # type: ignore
+                    incremental=incremental,
+                    chunk_size=chunk_size,
+                    query_adapter_callback=query_adapter_callback,
+                )
+                try:
+                    yield from loader.load_rows(backend_kwargs)
+                finally:
+                    if getattr(engine, "may_dispose_after_use", False):
+                        engine.dispose()
+
+            dlt.sources.sql_database.table_rows = table_rows
+
+            def query_adapter_callback(query, table, incremental=None, engine=None):
+                params = {}
+                if incremental:
+                    params["interval_start"] = (
+                        incremental.last_value
+                        if incremental.last_value is not None
+                        else datetime(year=1, month=1, day=1)
+                    )
+                    if incremental.end_value is not None:
+                        params["interval_end"] = incremental.end_value
+                else:
+                    if ":interval_start" in query_value:
+                        params["interval_start"] = (
+                            datetime.min
+                            if kwargs.get("interval_start") is None
+                            else kwargs.get("interval_start")
+                        )
+                    if ":interval_end" in query_value:
+                        params["interval_end"] = (
+                            datetime.max
+                            if kwargs.get("interval_end") is None
+                            else kwargs.get("interval_end")
+                        )
+
+                return sqlalchemy.text(query_value).bindparams(**params)
+
         def type_adapter_callback(sql_type):
             if isinstance(sql_type, mysql.SET):
                 return sa.JSON
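Note: the custom-query path ultimately reduces to binding the optional :interval_start/:interval_end placeholders onto the raw SQL. A minimal self-contained sketch of that binding (the table name and date are made up for illustration):

import sqlalchemy

query_value = "select id, updated_at from users where updated_at > :interval_start"
stmt = sqlalchemy.text(query_value).bindparams(interval_start="2024-01-01")
# stmt is a TextClause carrying the bound value, ready to be executed on an engine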
@@ -97,7 +234,7 @@ class SqlSource:
             schema=table_fields.dataset,
             table=table_fields.table,
             incremental=incremental,
-            backend=
+            backend=sql_backend,
             chunk_size=kwargs.get("page_size", None),
             reflection_level=reflection_level,
             query_adapter_callback=query_adapter_callback,
@@ -105,6 +242,7 @@ class SqlSource:
             table_adapter_callback=table_adapter_exclude_columns(
                 kwargs.get("sql_exclude_columns", [])
             ),
+            defer_table_reflect=defer_table_reflect,
         )

         return builder_res
@@ -1015,29 +1153,28 @@ class TikTokSource:
         if not access_token:
             raise ValueError("access_token is required to connect to TikTok")

-
+        timezone = "UTC"
+        if source_fields.get("timezone") is not None:
+            timezone = source_fields.get("timezone")[0]  # type: ignore

-
-        if not
-        raise ValueError("
+        advertiser_ids = source_fields.get("advertiser_ids")
+        if not advertiser_ids:
+            raise ValueError("advertiser_ids is required to connect to TikTok")

-
-
+        advertiser_ids = advertiser_ids[0].replace(" ", "").split(",")
+
+        start_date = pendulum.now().subtract(days=30).in_tz(timezone)
+        end_date = ensure_pendulum_datetime(pendulum.now()).in_tz(timezone)

         interval_start = kwargs.get("interval_start")
         if interval_start is not None:
-            start_date = ensure_pendulum_datetime(interval_start).in_tz(
+            start_date = ensure_pendulum_datetime(interval_start).in_tz(timezone)

         interval_end = kwargs.get("interval_end")
         if interval_end is not None:
-            end_date = ensure_pendulum_datetime(interval_end).in_tz(
+            end_date = ensure_pendulum_datetime(interval_end).in_tz(timezone)

-        page_size = kwargs.get("page_size")
-        if page_size is not None and not isinstance(page_size, int):
-            page_size = int(page_size)
-
-        if page_size > 1000:
-            page_size = 1000
+        page_size = min(1000, kwargs.get("page_size", 1000))

         if table.startswith("custom:"):
             fields = table.split(":", 3)
@@ -1049,28 +1186,61 @@ class TikTokSource:
             dimensions = fields[1].replace(" ", "").split(",")
             if (
                 "campaign_id" not in dimensions
-                and "advertiser_id" not in dimensions
                 and "adgroup_id" not in dimensions
                 and "ad_id" not in dimensions
             ):
                 raise ValueError(
-                    "
+                    "TikTok API requires at least one ID dimension, please use one of the following dimensions: [campaign_id, adgroup_id, ad_id]"
                 )

+            if "advertiser_id" in dimensions:
+                dimensions.remove("advertiser_id")
+
             metrics = fields[2].replace(" ", "").split(",")
-
+            filtering_param = False
+            filter_name = ""
+            filter_value = []
             if len(fields) == 4:
-
+
+                def parse_filters(filters_raw: str) -> dict:
+                    # Parse a filter string like "key1=value1,key2=value2,value3,value4"
+                    filters = {}
+                    current_key = None
+
+                    for item in filters_raw.split(","):
+                        if "=" in item:
+                            # Start of a new key-value pair
+                            key, value = item.split("=")
+                            filters[key] = [value]  # Always start with a list
+                            current_key = key
+                        elif current_key is not None:
+                            # Additional value for the current key
+                            filters[current_key].append(item)
+
+                    # Convert single-item lists to simple values
+                    return {k: v[0] if len(v) == 1 else v for k, v in filters.items()}
+
+                filtering_param = True
+                filters = parse_filters(fields[3])
+                if len(filters) > 1:
+                    raise ValueError(
+                        "Only one filter is allowed for TikTok custom reports"
+                    )
+                filter_name = list(filters.keys())[0]
+                filter_value = list(map(int, filters[list(filters.keys())[0]]))
+
             return tiktok_source(
                 start_date=start_date,
                 end_date=end_date,
                 access_token=access_token[0],
-
-
+                advertiser_ids=advertiser_ids,
+                timezone=timezone,
                 dimensions=dimensions,
                 metrics=metrics,
-                filters=filters,
                 page_size=page_size,
+                filter_name=filter_name,
+                filter_value=filter_value,
+                filtering_param=filtering_param,
             ).with_resources(endpoint)

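Note: parse_filters collapses single-item lists to bare values, which matters because the map(int, ...) line above iterates whatever it receives. For example:

parse_filters("campaign_ids=111,222,333")
# -> {"campaign_ids": ["111", "222", "333"]}

parse_filters("campaign_ids=111")
# -> {"campaign_ids": "111"}  # single values come back as a bare string

In the single-value case map(int, "111") iterates the characters of the string, so multi-value filters like the first example are the shape the code expects.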
@@ -1171,3 +1341,103 @@ class DynamoDBSource:
         )

         return dynamodb(table, creds, incremental)
+
+
+class GoogleAnalyticsSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parse_uri = urlparse(uri)
+        source_fields = parse_qs(parse_uri.query)
+        cred_path = source_fields.get("credentials_path")
+
+        if not cred_path:
+            raise ValueError("credentials_path is required to connect to Google Analytics")
+        credentials = {}
+
+        with open(cred_path[0], "r") as f:
+            credentials = json.load(f)
+
+        property_id = source_fields.get("property_id")
+        if not property_id:
+            raise ValueError("property_id is required to connect to Google Analytics")
+
+        interval_start = kwargs.get("interval_start")
+        start_date = (
+            interval_start.strftime("%Y-%m-%d") if interval_start else "2015-08-14"
+        )
+
+        fields = table.split(":")
+        if len(fields) != 3:
+            raise ValueError(
+                "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
+            )
+
+        dimensions = fields[1].replace(" ", "").split(",")
+
+        datetime = ""
+        for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
+            if dimension_datetime in dimensions:
+                datetime = dimension_datetime
+                break
+        else:
+            raise ValueError(
+                "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
+            )
+
+        metrics = fields[2].replace(" ", "").split(",")
+        queries = [
+            {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
+        ]
+
+        return google_analytics(
+            property_id=property_id[0],
+            start_date=start_date,
+            datetime=datetime,
+            queries=queries,
+            credentials=credentials,
+        ).with_resources("basic_report")
+
+
+class GitHubSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "GitHub takes care of incrementality on its own, you should not provide incremental_key"
+            )
+        # github://?access_token=<access_token>&owner=<owner>&repo=<repo>
+        parsed_uri = urlparse(uri)
+        source_fields = parse_qs(parsed_uri.query)
+
+        owner = source_fields.get("owner", [None])[0]
+        if not owner:
+            raise ValueError(
+                "owner of the repository is required to connect with GitHub"
+            )
+
+        repo = source_fields.get("repo", [None])[0]
+        if not repo:
+            raise ValueError(
+                "repo variable is required to retrieve data for a specific repository from GitHub."
+            )
+
+        access_token = source_fields.get("access_token", [None])[0]
+        if not access_token and table not in ["repo_events"]:
+            raise ValueError("access_token is required to connect with GitHub")
+
+        if table in ["issues", "pull_requests"]:
+            return github_reactions(
+                owner=owner, name=repo, access_token=access_token
+            ).with_resources(table)
+        elif table == "repo_events":
+            return github_repo_events(owner=owner, name=repo, access_token=access_token)
+        elif table == "stargazers":
+            return github_stargazers(owner=owner, name=repo, access_token=access_token)
+        else:
+            raise ValueError(
+                f"Resource '{table}' is not supported for GitHub source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )