ingestr 0.12.3__py3-none-any.whl → 0.12.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- ingestr/main.py +53 -5
- ingestr/src/arrow/__init__.py +0 -4
- ingestr/src/factory.py +4 -0
- ingestr/src/github/__init__.py +149 -0
- ingestr/src/github/helpers.py +193 -0
- ingestr/src/github/queries.py +115 -0
- ingestr/src/github/settings.py +10 -0
- ingestr/src/google_analytics/__init__.py +70 -0
- ingestr/src/google_analytics/helpers/__init__.py +70 -0
- ingestr/src/google_analytics/helpers/data_processing.py +176 -0
- ingestr/src/sources.py +123 -47
- ingestr/src/sql_database/__init__.py +0 -0
- ingestr/src/sql_database/callbacks.py +66 -0
- ingestr/src/version.py +1 -1
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/METADATA +4 -3
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/RECORD +19 -10
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/WHEEL +0 -0
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/entry_points.txt +0 -0
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/google_analytics/helpers/__init__.py
ADDED

@@ -0,0 +1,70 @@
+"""Google analytics source helpers"""
+
+from typing import Iterator, List
+
+import dlt
+from apiclient.discovery import Resource  # type: ignore
+from dlt.common import logger, pendulum
+from dlt.common.typing import TDataItem
+from google.analytics.data_v1beta.types import (
+    Dimension,
+    Metric,
+)
+from pendulum.datetime import DateTime
+
+from .data_processing import get_report
+
+
+def basic_report(
+    client: Resource,
+    rows_per_page: int,
+    dimensions: List[str],
+    metrics: List[str],
+    property_id: int,
+    resource_name: str,
+    start_date: str,
+    last_date: dlt.sources.incremental[DateTime],
+) -> Iterator[TDataItem]:
+    """
+    Retrieves the data for a report given dimensions, metrics, and filters required for the report.
+
+    Args:
+        client: The Google Analytics client used to make requests.
+        dimensions: Dimensions for the report. See metadata for the full list of dimensions.
+        metrics: Metrics for the report. See metadata for the full list of metrics.
+        property_id: A reference to the Google Analytics project.
+            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+        rows_per_page: Controls how many rows are retrieved per page in the reports.
+            Default is 10000, maximum possible is 100000.
+        resource_name: The resource name used to save incremental into dlt state.
+        start_date: Incremental load start_date.
+            Default is taken from dlt state if it exists.
+        last_date: Incremental load end date.
+            Default is taken from dlt state if it exists.
+
+    Returns:
+        Generator of all rows of data in the report.
+    """
+
+    # grab the start time from last dlt load if not filled, if that is also empty then use the first day of the millennium as the start time instead
+    if last_date.last_value:
+        if start_date != "2015-08-14":
+            logger.warning(
+                f"Using the starting date: {last_date.last_value} for incremental report: {resource_name} and ignoring start date passed as argument {start_date}"
+            )
+        start_date = last_date.last_value.to_date_string()
+    else:
+        start_date = start_date or "2015-08-14"
+
+    processed_response = get_report(
+        client=client,
+        property_id=property_id,
+        # fill dimensions and metrics with the proper api client objects
+        dimension_list=[Dimension(name=dimension) for dimension in dimensions],
+        metric_list=[Metric(name=metric) for metric in metrics],
+        limit=rows_per_page,
+        start_date=start_date,
+        # configure end_date to yesterday as a date string
+        end_date=pendulum.now().to_date_string(),
+    )
+    yield from processed_response
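A note on the last_date: dlt.sources.incremental[DateTime] argument above: dlt injects it at call time and persists its last_value in pipeline state between runs, which is what lets basic_report resume from the previous load. A minimal, self-contained sketch of that mechanism (the resource name and row values are illustrative, not part of ingestr):

import dlt

@dlt.resource
def daily_rows(
    last_date=dlt.sources.incremental("date", initial_value="2015-08-14"),
):
    # dlt restores last_date.last_value from pipeline state on each run;
    # previously seen values are filtered out of subsequent loads.
    yield {"date": "2024-01-01", "sessions": 3}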
ingestr/src/google_analytics/helpers/data_processing.py
ADDED

@@ -0,0 +1,176 @@
+"""
+This module contains helpers that process data and make it ready for loading into the database
+"""
+
+import json
+from typing import Any, Iterator, List, Union
+
+import proto
+from dlt.common.exceptions import MissingDependencyException
+from dlt.common.pendulum import pendulum
+from dlt.common.typing import DictStrAny, TDataItem, TDataItems
+
+try:
+    from google.analytics.data_v1beta import BetaAnalyticsDataClient  # noqa: F401
+    from google.analytics.data_v1beta.types import (
+        DateRange,
+        Dimension,
+        DimensionExpression,  # noqa: F401
+        DimensionMetadata,  # noqa: F401
+        GetMetadataRequest,  # noqa: F401
+        Metadata,  # noqa: F401
+        Metric,
+        MetricMetadata,  # noqa: F401
+        MetricType,
+        RunReportRequest,
+        RunReportResponse,
+    )
+except ImportError:
+    raise MissingDependencyException(
+        "Google Analytics API Client", ["google-analytics-data"]
+    )
+try:
+    from apiclient.discovery import Resource, build  # type: ignore # noqa: F401
+except ImportError:
+    raise MissingDependencyException("Google API Client", ["google-api-python-client"])
+
+
+def to_dict(item: Any) -> Iterator[TDataItem]:
+    """
+    Processes a batch result (page of results per dimension) accordingly
+    :param batch:
+    :return:
+    """
+    item = json.loads(
+        proto.Message.to_json(
+            item,
+            preserving_proto_field_name=True,
+            use_integers_for_enums=False,
+            including_default_value_fields=False,
+        )
+    )
+    yield item
+
+
+def get_report(
+    client: Resource,
+    property_id: int,
+    dimension_list: List[Dimension],
+    metric_list: List[Metric],
+    limit: int,
+    start_date: str,
+    end_date: str,
+) -> Iterator[TDataItem]:
+    """
+    Gets all the possible pages of reports with the given query parameters.
+    Processes every page and yields a dictionary for every row of the report.
+
+    Args:
+        client: The Google Analytics client used to make requests.
+        property_id: A reference to the Google Analytics project.
+            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+        dimension_list: A list of all the dimensions requested in the query.
+        metric_list: A list of all the metrics requested in the query.
+        limit: Describes how many rows there should be per page.
+        start_date: The starting date of the query.
+        end_date: The ending date of the query.
+
+    Yields:
+        Generator of all rows of data in the report.
+    """
+
+    request = RunReportRequest(
+        property=f"properties/{property_id}",
+        dimensions=dimension_list,
+        metrics=metric_list,
+        limit=limit,
+        date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
+    )
+    # process request
+    response = client.run_report(request)
+    processed_response_generator = process_report(response=response)
+    yield from processed_response_generator
+
+
+def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
+    """
+    Receives a single page for a report response, processes it, and returns a generator for every row of data in the report page.
+
+    Args:
+        response: The API response for a single page of the report.
+
+    Yields:
+        Generator of dictionaries for every row of the report page.
+    """
+
+    metrics_headers = [header.name for header in response.metric_headers]
+    dimensions_headers = [header.name for header in response.dimension_headers]
+
+    distinct_key_combinations = {}
+
+    for row in response.rows:
+        response_dict: DictStrAny = {
+            dimension_header: _resolve_dimension_value(
+                dimension_header, dimension_value.value
+            )
+            for dimension_header, dimension_value in zip(
+                dimensions_headers, row.dimension_values
+            )
+        }
+
+        for i in range(len(metrics_headers)):
+            # get metric type and process the value depending on type. Save metric name including type as well for the columns
+            metric_type = response.metric_headers[i].type_
+            metric_value = process_metric_value(
+                metric_type=metric_type, value=row.metric_values[i].value
+            )
+            response_dict[metrics_headers[i]] = metric_value
+
+        unique_key = "-".join(list(response_dict.keys()))
+        if unique_key not in distinct_key_combinations:
+            distinct_key_combinations[unique_key] = True
+
+        yield response_dict
+
+
+def process_metric_value(metric_type: MetricType, value: str) -> Union[str, int, float]:
+    """
+    Processes the metric type, converts it from string to the correct type, and returns it.
+
+    Args:
+        metric_type: The type of the metric.
+        value: The value of the metric as a string.
+
+    Returns:
+        The given value converted to the correct data type.
+    """
+
+    # So far according to GA4 documentation these are the correct types: https://developers.google.com/analytics/devguides/reporting/data/v1/rest/v1beta/MetricType
+    # 0 for strings, 1 for ints and 2-12 are different types of floating points.
+    if metric_type.value == 0:
+        return value
+    elif metric_type.value == 1:
+        return int(value)
+    else:
+        return float(value)
+
+
+def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
+    """
+    Helper function that receives a dimension's name and value and converts it to a datetime object if needed.
+
+    Args:
+        dimension_name: Name of the dimension.
+        dimension_value: Value of the dimension.
+
+    Returns:
+        The value of the dimension with the correct data type.
+    """
+    if dimension_name == "date":
+        return pendulum.from_format(dimension_value, "YYYYMMDD", tz="UTC")
+    elif dimension_name == "dateHour":
+        return pendulum.from_format(dimension_value, "YYYYMMDDHH", tz="UTC")
+    elif dimension_name == "dateHourMinute":
+        return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
+    else:
+        return dimension_value
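The row shaping in process_report above is ordinary zipping of headers against values; stripped of the GA client types, it amounts to the following (dummy data, no API access required):

dimension_headers = ["date", "browser"]
metric_headers = ["totalUsers"]
rows = [(["20240101", "Chrome"], ["42"])]

for dimension_values, metric_values in rows:
    record = dict(zip(dimension_headers, dimension_values))
    for name, value in zip(metric_headers, metric_values):
        record[name] = int(value)  # what process_metric_value does for an integer metric
    print(record)  # {'date': '20240101', 'browser': 'Chrome', 'totalUsers': 42}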
ingestr/src/sources.py
CHANGED

@@ -18,7 +18,6 @@ from urllib.parse import ParseResult, parse_qs, quote, urlparse
 
 import dlt
 import pendulum
-import sqlalchemy
 from dlt.common.configuration.specs import (
     AwsCredentials,
 )
@@ -29,7 +28,9 @@ from dlt.common.libs.sql_alchemy import (
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItem, TSecretStrValue
 from dlt.extract import Incremental
-from dlt.sources.credentials import ConnectionStringCredentials
+from dlt.sources.credentials import (
+    ConnectionStringCredentials,
+)
 from dlt.sources.sql_database import sql_table
 from dlt.sources.sql_database.helpers import TableLoader
 from dlt.sources.sql_database.schema_types import (
@@ -40,7 +41,6 @@ from dlt.sources.sql_database.schema_types import (
 )
 from sqlalchemy import Column
 from sqlalchemy import types as sa
-from sqlalchemy.dialects import mysql
 
 from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
 from ingestr.src.adjust.adjust_helpers import parse_filters
@@ -53,6 +53,8 @@ from ingestr.src.dynamodb import dynamodb
 from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
 from ingestr.src.filesystem import readers
 from ingestr.src.filters import table_adapter_exclude_columns
+from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
+from ingestr.src.google_analytics import google_analytics
 from ingestr.src.google_sheets import google_spreadsheet
 from ingestr.src.gorgias import gorgias_source
 from ingestr.src.hubspot import hubspot
@@ -63,6 +65,12 @@ from ingestr.src.mongodb import mongodb_collection
 from ingestr.src.notion import notion_databases
 from ingestr.src.shopify import shopify_source
 from ingestr.src.slack import slack_source
+from ingestr.src.sql_database.callbacks import (
+    chained_query_adapter_callback,
+    custom_query_variable_subsitution,
+    limit_callback,
+    type_adapter_callback,
+)
 from ingestr.src.stripe_analytics import stripe_source
 from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
 from ingestr.src.tiktok_ads import tiktok_source
@@ -95,27 +103,22 @@ class SqlSource:
         if kwargs.get("incremental_key"):
             start_value = kwargs.get("interval_start")
             end_value = kwargs.get("interval_end")
-
             incremental = dlt.sources.incremental(
                 kwargs.get("incremental_key", ""),
-                # primary_key=(),
                 initial_value=start_value,
                 end_value=end_value,
+                range_end="closed",
+                range_start="closed",
             )
 
         if uri.startswith("mysql://"):
             uri = uri.replace("mysql://", "mysql+pymysql://")
 
-
-
-        query_adapter_callback = None
+        query_adapters = []
         if kwargs.get("sql_limit"):
-
-
-
-            if kwargs.get("incremental_key"):
-                query = query.order_by(kwargs.get("incremental_key"))
-            return query
+            query_adapters.append(
+                limit_callback(kwargs.get("sql_limit"), kwargs.get("incremental_key"))
+            )
 
         defer_table_reflect = False
         sql_backend = kwargs.get("sql_backend", "sqlalchemy")
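The limit_callback used above is defined in the new ingestr/src/sql_database/callbacks.py later in this diff; applied to a SQLAlchemy Select it appends ORDER BY and LIMIT. A small sketch, assuming the package is installed (the table and column names are made up):

from sqlalchemy import column, select, table

from ingestr.src.sql_database.callbacks import limit_callback

events = table("events", column("id"), column("updated_at"))
adapter = limit_callback(10, "updated_at")
print(adapter(select(events), events))
# roughly: SELECT events.id, events.updated_at FROM events ORDER BY updated_at LIMIT :param_1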
@@ -158,6 +161,7 @@ class SqlSource:
         switchDict = {
             int: sa.INTEGER,
             datetime: sa.TIMESTAMP,
+            date: sa.DATE,
             pendulum.Date: sa.DATE,
             pendulum.DateTime: sa.TIMESTAMP,
         }
@@ -193,38 +197,10 @@ class SqlSource:
             if getattr(engine, "may_dispose_after_use", False):
                 engine.dispose()
 
-        dlt.sources.sql_database.table_rows = table_rows
-
-        def query_adapter_callback(query, table, incremental=None, engine=None):
-            params = {}
-            if incremental:
-                params["interval_start"] = (
-                    incremental.last_value
-                    if incremental.last_value is not None
-                    else datetime(year=1, month=1, day=1)
-                )
-                if incremental.end_value is not None:
-                    params["interval_end"] = incremental.end_value
-            else:
-                if ":interval_start" in query_value:
-                    params["interval_start"] = (
-                        datetime.min
-                        if kwargs.get("interval_start") is None
-                        else kwargs.get("interval_start")
-                    )
-                if ":interval_end" in query_value:
-                    params["interval_end"] = (
-                        datetime.max
-                        if kwargs.get("interval_end") is None
-                        else kwargs.get("interval_end")
-                    )
-
-            return sqlalchemy.text(query_value).bindparams(**params)
+        dlt.sources.sql_database.table_rows = table_rows  # type: ignore
 
-
-
-                return sa.JSON
-            return sql_type
+        # override the query adapters, the only one we want is the one here in the case of custom queries
+        query_adapters = [custom_query_variable_subsitution(query_value, kwargs)]
 
         builder_res = self.table_builder(
             credentials=ConnectionStringCredentials(uri),
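custom_query_variable_subsitution, which replaces the inline adapter deleted above, keeps the same behaviour: it binds the :interval_start / :interval_end placeholders in the user's custom query. The underlying SQLAlchemy mechanism is simply:

from datetime import datetime

from sqlalchemy import text

stmt = text("SELECT * FROM t WHERE updated_at > :interval_start").bindparams(
    interval_start=datetime(2024, 1, 1)
)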
@@ -233,8 +209,8 @@ class SqlSource:
             incremental=incremental,
             backend=sql_backend,
             chunk_size=kwargs.get("page_size", None),
-            reflection_level=
-            query_adapter_callback=
+            reflection_level=kwargs.get("sql_reflection_level", None),
+            query_adapter_callback=chained_query_adapter_callback(query_adapters),
             type_adapter_callback=type_adapter_callback,
             table_adapter_callback=table_adapter_exclude_columns(
                 kwargs.get("sql_exclude_columns", [])
@@ -1338,3 +1314,103 @@ class DynamoDBSource:
         )
 
         return dynamodb(table, creds, incremental)
+
+
+class GoogleAnalyticsSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parse_uri = urlparse(uri)
+        source_fields = parse_qs(parse_uri.query)
+        cred_path = source_fields.get("credentials_path")
+
+        if not cred_path:
+            raise ValueError("credentials_path is required to connect Google Analytics")
+        credentials = {}
+
+        with open(cred_path[0], "r") as f:
+            credentials = json.load(f)
+
+        property_id = source_fields.get("property_id")
+        if not property_id:
+            raise ValueError("property_id is required to connect to Google Analytics")
+
+        interval_start = kwargs.get("interval_start")
+        start_date = (
+            interval_start.strftime("%Y-%m-%d") if interval_start else "2015-08-14"
+        )
+
+        fields = table.split(":")
+        if len(fields) != 3:
+            raise ValueError(
+                "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
+            )
+
+        dimensions = fields[1].replace(" ", "").split(",")
+
+        datetime = ""
+        for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
+            if dimension_datetime in dimensions:
+                datetime = dimension_datetime
+                break
+        else:
+            raise ValueError(
+                "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
+            )
+
+        metrics = fields[2].replace(" ", "").split(",")
+        queries = [
+            {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
+        ]
+
+        return google_analytics(
+            property_id=property_id[0],
+            start_date=start_date,
+            datetime=datetime,
+            queries=queries,
+            credentials=credentials,
+        ).with_resources("basic_report")
+
+
+class GitHubSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Github takes care of incrementality on its own, you should not provide incremental_key"
+            )
+        # github://?access_token=<access_token>&owner=<owner>&repo=<repo>
+        parsed_uri = urlparse(uri)
+        source_fields = parse_qs(parsed_uri.query)
+
+        owner = source_fields.get("owner", [None])[0]
+        if not owner:
+            raise ValueError(
+                "owner of the repository is required to connect with GitHub"
+            )
+
+        repo = source_fields.get("repo", [None])[0]
+        if not repo:
+            raise ValueError(
+                "repo variable is required to retrieve data for a specific repository from GitHub."
+            )
+
+        access_token = source_fields.get("access_token", [None])[0]
+        if not access_token and table not in ["repo_events"]:
+            raise ValueError("access_token is required to connect with GitHub")
+
+        if table in ["issues", "pull_requests"]:
+            return github_reactions(
+                owner=owner, name=repo, access_token=access_token
+            ).with_resources(table)
+        elif table == "repo_events":
+            return github_repo_events(owner=owner, name=repo, access_token=access_token)
+        elif table == "stargazers":
+            return github_stargazers(owner=owner, name=repo, access_token=access_token)
+        else:
+            raise ValueError(
+                f"Resource '{table}' is not supported for GitHub source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
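Both new source classes read their settings from query parameters on the source URI; the parsing is plain urllib, for example (placeholder values):

from urllib.parse import parse_qs, urlparse

uri = "github://?access_token=TOKEN&owner=bruin-data&repo=ingestr"
fields = parse_qs(urlparse(uri).query)
print(fields.get("owner", [None])[0])  # bruin-data
print(fields.get("repo", [None])[0])   # ingestr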
ingestr/src/sql_database/__init__.py
File without changes
ingestr/src/sql_database/callbacks.py
ADDED

@@ -0,0 +1,66 @@
+from datetime import datetime
+
+from sqlalchemy import text
+from sqlalchemy import types as sa
+from sqlalchemy.dialects import mysql
+
+
+def type_adapter_callback(sql_type):
+    if isinstance(sql_type, mysql.SET):
+        return sa.JSON
+    return sql_type
+
+
+def chained_query_adapter_callback(query_adapters):
+    """
+    This function is used to chain multiple query adapters together,.
+    This gives us the flexibility to introduce various adapters based on the given command parameters.
+    """
+
+    def callback(query, table):
+        for adapter in query_adapters:
+            query = adapter(query, table)
+
+        return query
+
+    return callback
+
+
+def limit_callback(sql_limit: int, incremental_key: str):
+    def callback(query, table):
+        query = query.limit(sql_limit)
+        if incremental_key:
+            query = query.order_by(incremental_key)
+        return query
+
+    return callback
+
+
+def custom_query_variable_subsitution(query_value: str, kwargs: dict):
+    def callback(query, table, incremental=None, engine=None):
+        params = {}
+        if incremental:
+            params["interval_start"] = (
+                incremental.last_value
+                if incremental.last_value is not None
+                else datetime(year=1, month=1, day=1)
+            )
+            if incremental.end_value is not None:
+                params["interval_end"] = incremental.end_value
+        else:
+            if ":interval_start" in query_value:
+                params["interval_start"] = (
+                    datetime.min
+                    if kwargs.get("interval_start") is None
+                    else kwargs.get("interval_start")
+                )
+            if ":interval_end" in query_value:
+                params["interval_end"] = (
+                    datetime.max
+                    if kwargs.get("interval_end") is None
+                    else kwargs.get("interval_end")
+                )
+
+        return text(query_value).bindparams(**params)
+
+    return callback
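To see how the chaining composes, a tiny sketch; the lambdas stand in for real adapters, and strings replace the SQLAlchemy Select that ingestr actually passes through, purely to keep the example runnable:

from ingestr.src.sql_database.callbacks import chained_query_adapter_callback

adapters = [
    lambda query, table: query + " LIMIT 10",    # stand-in for limit_callback
    lambda query, table: query + " /* hint */",  # any further adapter
]
chained = chained_query_adapter_callback(adapters)
print(chained("SELECT * FROM t", table=None))  # SELECT * FROM t LIMIT 10 /* hint */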
ingestr/src/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.12.3"
+__version__ = "0.12.5"
{ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.12.3
+Version: 0.12.5
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -17,10 +17,11 @@ Requires-Python: >=3.9
 Requires-Dist: asana==3.2.3
 Requires-Dist: confluent-kafka>=2.6.1
 Requires-Dist: databricks-sql-connector==2.9.3
-Requires-Dist: dlt==1.
+Requires-Dist: dlt==1.5.0
 Requires-Dist: duckdb-engine==0.13.5
 Requires-Dist: duckdb==1.1.3
 Requires-Dist: facebook-business==20.0.0
+Requires-Dist: google-analytics-data==0.18.16
 Requires-Dist: google-api-python-client==2.130.0
 Requires-Dist: google-cloud-bigquery-storage==2.24.0
 Requires-Dist: mysql-connector-python==9.1.0
@@ -33,7 +34,7 @@ Requires-Dist: pyathena==3.9.0
 Requires-Dist: pymongo==4.10.1
 Requires-Dist: pymysql==1.1.1
 Requires-Dist: pyrate-limiter==3.7.0
-Requires-Dist: redshift-connector==2.1.
+Requires-Dist: redshift-connector==2.1.5
 Requires-Dist: rich==13.9.4
 Requires-Dist: rudder-sdk-python==2.1.4
 Requires-Dist: s3fs==2024.10.0
{ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/RECORD
CHANGED

@@ -1,18 +1,18 @@
-ingestr/main.py,sha256=
+ingestr/main.py,sha256=xLQiPHoD7dNvrHfNTwD379wHg6xZGmLxzPzQLq2E1RA,24746
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/destinations.py,sha256=zcHJIIHAZmcD9sJomd6G1Bc-1KsxnBD2aByOSV_9L3g,8850
-ingestr/src/factory.py,sha256=
+ingestr/src/factory.py,sha256=aE7TjHzONb4DKYcfh_6-CJJfvs4lmw7iUySvSm4yQbM,4516
 ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
-ingestr/src/sources.py,sha256=
+ingestr/src/sources.py,sha256=FXUTmII3DiEANZN37P9-dTFFRzpv0PL8bfaQvr0un8w,50761
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
-ingestr/src/version.py,sha256=
+ingestr/src/version.py,sha256=QFQfu3CUVe9Ncr8kv3aaBY3oWrZmv8xboen_Uwy7eXU,23
 ingestr/src/adjust/__init__.py,sha256=NaRNwDhItG8Q7vUHw7zQvyfWjmT32M0CSc5ufjmBM9U,3067
 ingestr/src/adjust/adjust_helpers.py,sha256=-tmmxy9k3wms-ZEIgxmlp2cAQ2X_O1lgjY1128bbMu4,3224
 ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
 ingestr/src/appsflyer/_init_.py,sha256=ne2-9FQ654Drtd3GkKQv8Bwb6LEqCnJw49MfO5Jyzgs,739
 ingestr/src/appsflyer/client.py,sha256=TNmwakLzmO6DZW3wcfLfQRl7aNBHgFqSsk4ef-MmJ1w,3084
-ingestr/src/arrow/__init__.py,sha256=
+ingestr/src/arrow/__init__.py,sha256=8fEntgHseKjFMiPQIzxYzw_raicNsEgnveLi1IzBca0,2848
 ingestr/src/asana_source/__init__.py,sha256=Y4Ti_876Yong420fQ2o4A97TdgrZNlZVxlTMLyXdSjA,8116
 ingestr/src/asana_source/helpers.py,sha256=PukcdDQWIGqnGxuuobbLw4hUy4-t6gxXg_XywR7Lg9M,375
 ingestr/src/asana_source/settings.py,sha256=-2tpdkwh04RvLKFvwQodnFLYn9MaxOO1hsebGnDQMTU,2829
@@ -27,6 +27,13 @@ ingestr/src/facebook_ads/settings.py,sha256=1IxZeP_4rN3IBvAncNHOoqpzAirx0Hz-MUK_
 ingestr/src/filesystem/__init__.py,sha256=wHHaKFuAjsR_ZRjl6g_Flf6FhVs9qhwREthTr03_7cc,4162
 ingestr/src/filesystem/helpers.py,sha256=bg0muSHZr3hMa8H4jN2-LGWzI-SUoKlQNiWJ74-YYms,3211
 ingestr/src/filesystem/readers.py,sha256=a0fKkaRpnAOGsXI3EBNYZa7x6tlmAOsgRzb883StY30,3987
+ingestr/src/github/__init__.py,sha256=csA2VcjOxXrVrvp7zY-JodO9Lpy98bJ4AqRdHCLTcGM,5838
+ingestr/src/github/helpers.py,sha256=Tmnik9811zBWNO6cJwV9PFQxEx2j32LHAQCvNbubsEI,6759
+ingestr/src/github/queries.py,sha256=W34C02jUEdjFmOE7f7u9xvYyBNDMfVZAu0JIRZI2mkU,2302
+ingestr/src/github/settings.py,sha256=N5ahWrDIQ_4IWV9i-hTXxyYduqY9Ym2BTwqsWxcDdJ8,258
+ingestr/src/google_analytics/__init__.py,sha256=HjA13wfJm2MGfy3h_DiM5ekkNqM2dgwYCKJ3pprnDtI,2482
+ingestr/src/google_analytics/helpers/__init__.py,sha256=y_q7dinlEwNBEpq6kCzjTa8lAhe2bb23bDPP0fcy7fY,2744
+ingestr/src/google_analytics/helpers/data_processing.py,sha256=fIdEKr9CmZN_s1T2i9BL8IYTPPqNoK6Vaquq2y8StfE,6072
 ingestr/src/google_sheets/README.md,sha256=wFQhvmGpRA38Ba2N_WIax6duyD4c7c_pwvvprRfQDnw,5470
 ingestr/src/google_sheets/__init__.py,sha256=5qlX-6ilx5MW7klC7B_0jGSxloQSLkSESTh4nlY3Aos,6643
 ingestr/src/google_sheets/helpers/__init__.py,sha256=5hXZrZK8cMO3UOuL-s4OKOpdACdihQD0hYYlSEu-iQ8,35
@@ -56,6 +63,8 @@ ingestr/src/shopify/settings.py,sha256=StY0EPr7wFJ7KzRRDN4TKxV0_gkIS1wPj2eR4AYSs
 ingestr/src/slack/__init__.py,sha256=UF-ficQ6K32u1EHytW3P35suACo9wuc6nMrAPViyZL8,9981
 ingestr/src/slack/helpers.py,sha256=08TLK7vhFvH_uekdLVOLF3bTDe1zgH0QxHObXHzk1a8,6545
 ingestr/src/slack/settings.py,sha256=NhKn4y1zokEa5EmIZ05wtj_-I0GOASXZ5V81M1zXCtY,457
+ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestr/src/sql_database/callbacks.py,sha256=sEFFmXxAURY3yeBjnawigDtq9LBCvi8HFqG4kLd7tMU,2002
 ingestr/src/stripe_analytics/__init__.py,sha256=VEXH4to2vNojN4rk3qsypR7urtTzaxSBB3IBiD5tuoE,4514
 ingestr/src/stripe_analytics/helpers.py,sha256=iqZOyiGIOhOAhVXXU16DP0hkkTKcTrDu69vAJoTxgEo,1976
 ingestr/src/stripe_analytics/settings.py,sha256=rl9L5XumxO0pjkZf7MGesXHp4QLRgnz3RWLuDWDBKXo,380
@@ -77,8 +86,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
 ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
-ingestr-0.12.3.dist-info/METADATA,sha256=
-ingestr-0.12.3.dist-info/WHEEL,sha256=
-ingestr-0.12.3.dist-info/entry_points.txt,sha256=
-ingestr-0.12.3.dist-info/licenses/LICENSE.md,sha256=
-ingestr-0.12.3.dist-info/RECORD,,
+ingestr-0.12.5.dist-info/METADATA,sha256=QhFy0K3FUgK2VGdShWUOeTj_HbHElVPD64bAf2k-4G0,7956
+ingestr-0.12.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.12.5.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.12.5.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.12.5.dist-info/RECORD,,
{ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/WHEEL
File without changes

{ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/entry_points.txt
File without changes

{ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/licenses/LICENSE.md
File without changes