ingestr 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/src/asana_source/__init__.py +264 -0
- ingestr/src/asana_source/helpers.py +16 -0
- ingestr/src/asana_source/settings.py +144 -0
- ingestr/src/dynamodb/__init__.py +86 -0
- ingestr/src/factory.py +48 -58
- ingestr/src/sources.py +181 -4
- ingestr/src/tiktok_ads/__init__.py +106 -0
- ingestr/src/tiktok_ads/tiktok_helpers.py +112 -0
- ingestr/src/time.py +11 -0
- ingestr/src/version.py +1 -1
- ingestr/src/zendesk/__init__.py +1 -0
- {ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/METADATA +17 -5
- {ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/RECORD +16 -9
- {ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/WHEEL +1 -1
- {ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/entry_points.txt +0 -0
- {ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/asana_source/__init__.py ADDED
@@ -0,0 +1,264 @@
+"""
+This source provides data extraction from the Asana platform via their API.
+
+It defines several functions to fetch data from different parts of Asana, including
+workspaces, projects, sections, tags, tasks, stories, teams, and users. These
+functions are meant to be used as part of a data loading pipeline.
+"""
+
+import typing as t
+from typing import Any, Iterable
+
+import dlt
+from dlt.common.typing import TDataItem
+
+from .helpers import get_client
+from .settings import (
+    DEFAULT_START_DATE,
+    PROJECT_FIELDS,
+    REQUEST_TIMEOUT,
+    SECTION_FIELDS,
+    STORY_FIELDS,
+    TAG_FIELDS,
+    TASK_FIELDS,
+    TEAMS_FIELD,
+    USER_FIELDS,
+    WORKSPACE_FIELDS,
+)
+
+
+@dlt.source
+def asana_source() -> Any:  # should be Sequence[DltResource]:
+    """
+    The main function that runs all the other functions to fetch data from Asana.
+    Returns:
+        Sequence[DltResource]: A sequence of DltResource objects containing the fetched data.
+    """
+    return [
+        workspaces,
+        projects,
+        sections,
+        tags,
+        tasks,
+        stories,
+        teams,
+        users,
+    ]
+
+
+@dlt.resource(write_disposition="replace")
+def workspaces(
+    access_token: str = dlt.secrets.value, fields: Iterable[str] = WORKSPACE_FIELDS
+) -> Iterable[TDataItem]:
+    """
+    Fetches and returns a list of workspaces from Asana.
+    Args:
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of workspace fields to be retrieved from the Asana API.
+    Yields:
+        dict: The workspace data.
+    """
+    yield from get_client(access_token).workspaces.find_all(opt_fields=",".join(fields))
+
+
+@dlt.transformer(
+    data_from=workspaces,
+    write_disposition="replace",
+)
+@dlt.defer
+def projects(
+    workspace: TDataItem,
+    access_token: str = dlt.secrets.value,
+    fields: Iterable[str] = PROJECT_FIELDS,
+) -> Iterable[TDataItem]:
+    """
+    Fetches and returns a list of projects for a given workspace from Asana.
+    Args:
+        workspace (dict): The workspace data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of project fields to be retrieved from the Asana API.
+    Returns:
+        list[dict]: The project data for the given workspace.
+    """
+    return list(
+        get_client(access_token).projects.find_all(
+            workspace=workspace["gid"],
+            timeout=REQUEST_TIMEOUT,
+            opt_fields=",".join(fields),
+        )
+    )
+
+
+@dlt.transformer(
+    data_from=projects,
+    write_disposition="replace",
+)
+@dlt.defer
+def sections(
+    project_array: t.List[TDataItem],
+    access_token: str = dlt.secrets.value,
+    fields: Iterable[str] = SECTION_FIELDS,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all sections for a given project from Asana.
+    Args:
+        project_array (list): The project data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of section fields to be retrieved from the Asana API.
+    Returns:
+        list[dict]: The sections data for the given project.
+    """
+    return [
+        section
+        for project in project_array
+        for section in get_client(access_token).sections.get_sections_for_project(
+            project_gid=project["gid"],
+            timeout=REQUEST_TIMEOUT,
+            opt_fields=",".join(fields),
+        )
+    ]
+
+
+@dlt.transformer(data_from=workspaces, write_disposition="replace")
+@dlt.defer
+def tags(
+    workspace: TDataItem,
+    access_token: str = dlt.secrets.value,
+    fields: Iterable[str] = TAG_FIELDS,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all tags for a given workspace from Asana.
+    Args:
+        workspace (dict): The workspace data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of tag fields to be retrieved from the Asana API.
+    Returns:
+        list[dict]: The tags data for the given workspace.
+    """
+    return [
+        tag
+        for tag in get_client(access_token).tags.find_all(
+            workspace=workspace["gid"],
+            timeout=REQUEST_TIMEOUT,
+            opt_fields=",".join(fields),
+        )
+    ]
+
+
+@dlt.transformer(data_from=projects, write_disposition="merge", primary_key="gid")
+def tasks(
+    project_array: t.List[TDataItem],
+    access_token: str = dlt.secrets.value,
+    modified_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+        "modified_at", initial_value=DEFAULT_START_DATE
+    ),
+    fields: Iterable[str] = TASK_FIELDS,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all tasks for a given project from Asana.
+    Args:
+        project_array (list): The project data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        modified_at (str): The date from which to fetch modified tasks.
+        fields (Iterable[str]): The list of task fields to be retrieved from the Asana API.
+    Yields:
+        dict: The task data for the given project.
+    """
+    yield from (
+        task
+        for project in project_array
+        for task in get_client(access_token).tasks.find_all(
+            project=project["gid"],
+            timeout=REQUEST_TIMEOUT,
+            modified_since=modified_at.start_value,
+            opt_fields=",".join(fields),
+        )
+    )
+
+
+@dlt.transformer(
+    data_from=tasks,
+    write_disposition="append",
+)
+@dlt.defer
+def stories(
+    task: TDataItem,
+    access_token: str = dlt.secrets.value,
+    fields: Iterable[str] = STORY_FIELDS,
+) -> Iterable[TDataItem]:
+    """
+    Fetches stories for a task from Asana.
+    Args:
+        task (dict): The task data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of story fields to be retrieved from the Asana API.
+    Returns:
+        list[dict]: The stories data for the given task.
+    """
+    return [
+        story
+        for story in get_client(access_token).stories.get_stories_for_task(
+            task_gid=task["gid"],
+            timeout=REQUEST_TIMEOUT,
+            opt_fields=",".join(fields),
+        )
+    ]
+
+
+@dlt.transformer(
+    data_from=workspaces,
+    write_disposition="replace",
+)
+@dlt.defer
+def teams(
+    workspace: TDataItem,
+    access_token: str = dlt.secrets.value,
+    fields: Iterable[str] = TEAMS_FIELD,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all teams for a given workspace from Asana.
+    Args:
+        workspace (dict): The workspace data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of team fields to be retrieved from the Asana API.
+    Returns:
+        list[dict]: The teams data for the given workspace.
+    """
+    return [
+        team
+        for team in get_client(access_token).teams.find_by_organization(
+            organization=workspace["gid"],
+            timeout=REQUEST_TIMEOUT,
+            opt_fields=",".join(fields),
+        )
+    ]
+
+
+@dlt.transformer(
+    data_from=workspaces,
+    write_disposition="replace",
+)
+@dlt.defer
+def users(
+    workspace: TDataItem,
+    access_token: str = dlt.secrets.value,
+    fields: Iterable[str] = USER_FIELDS,
+) -> Iterable[TDataItem]:
+    """
+    Fetches all users for a given workspace from Asana.
+    Args:
+        workspace (dict): The workspace data.
+        access_token (str): The access token to authenticate the Asana API client, provided in the secrets file.
+        fields (Iterable[str]): The list of user fields to be retrieved from the Asana API.
+    Returns:
+        list[dict]: The user data for the given workspace.
+    """
+    return [
+        user
+        for user in get_client(access_token).users.find_all(
+            workspace=workspace["gid"],
+            timeout=REQUEST_TIMEOUT,
+            opt_fields=",".join(fields),
+        )
+    ]
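
For orientation, a minimal sketch of how this source would be consumed in a dlt pipeline. The pipeline name, destination, and dataset name are illustrative, and the access token is assumed to be configured under sources.asana_source.access_token in the dlt secrets (which is what AsanaSource.dlt_source in sources.py does programmatically):

import dlt

from ingestr.src.asana_source import asana_source

# hypothetical wiring; "duckdb" is chosen only for illustration
pipeline = dlt.pipeline(
    pipeline_name="asana_demo",
    destination="duckdb",
    dataset_name="asana_raw",
)

# dlt walks the transformer graph: workspaces feed projects, projects feed tasks
info = pipeline.run(asana_source().with_resources("workspaces", "projects", "tasks"))
print(info)
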
ingestr/src/asana_source/helpers.py ADDED
@@ -0,0 +1,16 @@
+"""Asana source helpers"""
+
+from asana import Client as AsanaClient
+
+
+def get_client(
+    access_token: str,
+) -> AsanaClient:
+    """
+    Returns an Asana API client.
+    Args:
+        access_token (str): The access token to authenticate the Asana API client.
+    Returns:
+        AsanaClient: The Asana API client.
+    """
+    return AsanaClient.access_token(access_token)
ingestr/src/asana_source/settings.py ADDED
@@ -0,0 +1,144 @@
+"""Asana source settings and constants"""
+
+# Default start date for Asana API requests; only tasks started after this date will be collected
+DEFAULT_START_DATE = "2010-01-01T00:00:00.000Z"
+
+# Asana API request timeout
+REQUEST_TIMEOUT = 300
+
+# List of workspace fields to be retrieved from the Asana API
+WORKSPACE_FIELDS = ("gid", "name", "is_organization", "resource_type", "email_domains")
+
+# List of project fields to be retrieved from the Asana API
+PROJECT_FIELDS = (
+    "name",
+    "gid",
+    "owner",
+    "current_status",
+    "custom_fields",
+    "default_view",
+    "due_date",
+    "due_on",
+    "is_template",
+    "created_at",
+    "modified_at",
+    "start_on",
+    "archived",
+    "public",
+    "members",
+    "followers",
+    "color",
+    "notes",
+    "icon",
+    "permalink_url",
+    "workspace",
+    "team",
+    "resource_type",
+    "current_status_update",
+    "custom_field_settings",
+    "completed",
+    "completed_at",
+    "completed_by",
+    "created_from_template",
+    "project_brief",
+)
+
+# List of section fields to be retrieved from the Asana API
+SECTION_FIELDS = (
+    "gid",
+    "resource_type",
+    "name",
+    "created_at",
+    "project",
+    "projects",
+)
+
+# List of tag fields to be retrieved from the Asana API
+TAG_FIELDS = (
+    "gid",
+    "resource_type",
+    "created_at",
+    "followers",
+    "name",
+    "color",
+    "notes",
+    "permalink_url",
+    "workspace",
+)
+
+# List of task fields to be retrieved from the Asana API
+TASK_FIELDS = (
+    "gid",
+    "resource_type",
+    "name",
+    "approval_status",
+    "assignee_status",
+    "created_at",
+    "assignee",
+    "start_on",
+    "start_at",
+    "due_on",
+    "due_at",
+    "completed",
+    "completed_at",
+    "completed_by",
+    "modified_at",
+    "dependencies",
+    "dependents",
+    "external",
+    "notes",
+    "num_subtasks",
+    "resource_subtype",
+    "followers",
+    "parent",
+    "permalink_url",
+    "tags",
+    "workspace",
+    "custom_fields",
+    "project",
+    "memberships",
+    "memberships.project.name",
+    "memberships.section.name",
+)
+
+# List of story fields to be retrieved from the Asana API
+STORY_FIELDS = (
+    "gid",
+    "resource_type",
+    "created_at",
+    "created_by",
+    "resource_subtype",
+    "text",
+    "is_pinned",
+    "assignee",
+    "dependency",
+    "follower",
+    "new_section",
+    "old_section",
+    "new_text_value",
+    "old_text_value",
+    "preview",
+    "project",
+    "source",
+    "story",
+    "tag",
+    "target",
+    "task",
+    "sticker_name",
+    "custom_field",
+    "type",
+)
+
+# List of team fields to be retrieved from the Asana API
+TEAMS_FIELD = (
+    "gid",
+    "resource_type",
+    "name",
+    "description",
+    "organization",
+    "permalink_url",
+    "visibility",
+)
+
+# List of user fields to be retrieved from the Asana API
+USER_FIELDS = ("gid", "resource_type", "name", "email", "photo", "workspaces")
ingestr/src/dynamodb/__init__.py ADDED
@@ -0,0 +1,86 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import boto3
+import dlt
+from boto3.dynamodb.conditions import Attr
+from dlt.common.configuration.specs import AwsCredentials
+
+PAGINATION_KEY = "LastEvaluatedKey"
+FILTER_KEY = "FilterExpression"
+DATA_KEY = "Items"
+
+
+@dataclass
+class TableSchema:
+    primary_key: Optional[str]
+    sort_key: Optional[str]
+
+
+def parseSchema(table) -> TableSchema:
+    schema = TableSchema(None, None)
+    for key in table.key_schema:
+        match key["KeyType"]:
+            case "HASH":
+                schema.primary_key = key["AttributeName"]
+            case "RANGE":
+                schema.sort_key = key["AttributeName"]
+
+    if schema.primary_key is None:
+        raise ValueError(f"Table {table.name} has no primary key!")
+
+    return schema
+
+
+@dlt.source
+def dynamodb(
+    table_name: str,
+    credentials: AwsCredentials,
+    incremental: Optional[dlt.sources.incremental] = None,
+):
+    sesh = boto3.Session(
+        aws_access_key_id=credentials.aws_access_key_id,
+        aws_secret_access_key=credentials.aws_secret_access_key,
+        region_name=credentials.region_name,
+    )
+    db = sesh.resource("dynamodb", endpoint_url=credentials.endpoint_url)
+    table = db.Table(table_name)
+    schema = parseSchema(table)
+    resource = dlt.resource(
+        dynamodb_table,
+        primary_key=schema.primary_key,
+    )
+
+    yield resource(table, incremental)
+
+
+def dynamodb_table(
+    table,
+    incremental: Optional[dlt.sources.incremental] = None,
+):
+    args = build_scan_args(incremental)
+    scan = table.scan(**args)
+    while True:
+        yield from scan[DATA_KEY]
+        if PAGINATION_KEY not in scan:
+            break
+        scan = table.scan(ExclusiveStartKey=scan[PAGINATION_KEY], **args)
+
+
+def build_scan_args(
+    incremental: Optional[dlt.sources.incremental] = None,
+):
+    scan_args = {}
+
+    if incremental is None:
+        return scan_args
+
+    criteria = None
+    if incremental.last_value:
+        criteria = Attr(incremental.cursor_path).gte(incremental.last_value)
+    if incremental.end_value:
+        criteria = Attr(incremental.cursor_path).between(
+            incremental.last_value, incremental.end_value
+        )
+    if criteria is not None:
+        scan_args[FILTER_KEY] = criteria
+
+    return scan_args
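
The scan loop above relies on DynamoDB's pagination contract: each Scan response carries Items plus, when results are truncated, a LastEvaluatedKey that is passed back as ExclusiveStartKey on the next call. For the incremental filter, a sketch of what build_scan_args assembles, using a hypothetical cursor attribute named updated_at and illustrative bounds:

from boto3.dynamodb.conditions import Attr

# with only a start value, the scan arguments are equivalent to:
scan_args = {"FilterExpression": Attr("updated_at").gte("2024-01-01T00:00:00")}

# with both bounds, the filter becomes a between() condition:
scan_args = {
    "FilterExpression": Attr("updated_at").between(
        "2024-01-01T00:00:00", "2024-06-30T23:59:59"
    )
}
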
ingestr/src/factory.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Protocol
+from typing import Dict, Protocol, Type
 from urllib.parse import urlparse
 
 from dlt.common.destination import Destination
@@ -20,7 +20,9 @@ from ingestr.src.sources import (
     AirtableSource,
     AppsflyerSource,
     ArrowMemoryMappedSource,
+    AsanaSource,
     ChessSource,
+    DynamoDBSource,
     FacebookAdsSource,
     GoogleSheetsSource,
     GorgiasSource,
@@ -35,6 +37,7 @@ from ingestr.src.sources import (
     SlackSource,
     SqlSource,
     StripeAnalyticsSource,
+    TikTokSource,
     ZendeskSource,
 )
 
@@ -92,6 +95,46 @@ def parse_scheme_from_uri(uri: str) -> str:
 class SourceDestinationFactory:
     source_scheme: str
     destination_scheme: str
+    sources: Dict[str, Type[SourceProtocol]] = {
+        "csv": LocalCsvSource,
+        "mongodb": MongoDbSource,
+        "notion": NotionSource,
+        "gsheets": GoogleSheetsSource,
+        "shopify": ShopifySource,
+        "gorgias": GorgiasSource,
+        "chess": ChessSource,
+        "stripe": StripeAnalyticsSource,
+        "facebookads": FacebookAdsSource,
+        "slack": SlackSource,
+        "hubspot": HubspotSource,
+        "airtable": AirtableSource,
+        "klaviyo": KlaviyoSource,
+        "appsflyer": AppsflyerSource,
+        "kafka": KafkaSource,
+        "adjust": AdjustSource,
+        "zendesk": ZendeskSource,
+        "mmap": ArrowMemoryMappedSource,
+        "s3": S3Source,
+        "dynamodb": DynamoDBSource,
+        "asana": AsanaSource,
+        "tiktok": TikTokSource,
+    }
+    destinations: Dict[str, Type[DestinationProtocol]] = {
+        "bigquery": BigQueryDestination,
+        "databricks": DatabricksDestination,
+        "duckdb": DuckDBDestination,
+        "mssql": MsSQLDestination,
+        "postgres": PostgresDestination,
+        "postgresql": PostgresDestination,
+        "postgresql+psycopg2": PostgresDestination,
+        "redshift": RedshiftDestination,
+        "redshift+psycopg2": RedshiftDestination,
+        "redshift+redshift_connector": RedshiftDestination,
+        "snowflake": SnowflakeDestination,
+        "synapse": SynapseDestination,
+        "csv": CsvDestination,
+        "athena": AthenaDestination,
+    }
 
     def __init__(self, source_uri: str, destination_uri: str):
         self.source_uri = source_uri
@@ -104,67 +147,14 @@ class SourceDestinationFactory:
     def get_source(self) -> SourceProtocol:
         if self.source_scheme in SQL_SOURCE_SCHEMES:
             return SqlSource()
-        elif self.source_scheme == "csv":
-            return LocalCsvSource()
-        elif self.source_scheme == "mongodb":
-            return MongoDbSource()
-        elif self.source_scheme == "notion":
-            return NotionSource()
-        elif self.source_scheme == "gsheets":
-            return GoogleSheetsSource()
-        elif self.source_scheme == "shopify":
-            return ShopifySource()
-        elif self.source_scheme == "gorgias":
-            return GorgiasSource()
-        elif self.source_scheme == "chess":
-            return ChessSource()
-        elif self.source_scheme == "stripe":
-            return StripeAnalyticsSource()
-        elif self.source_scheme == "facebookads":
-            return FacebookAdsSource()
-        elif self.source_scheme == "slack":
-            return SlackSource()
-        elif self.source_scheme == "hubspot":
-            return HubspotSource()
-        elif self.source_scheme == "airtable":
-            return AirtableSource()
-        elif self.source_scheme == "klaviyo":
-            return KlaviyoSource()
-        elif self.source_scheme == "appsflyer":
-            return AppsflyerSource()
-        elif self.source_scheme == "kafka":
-            return KafkaSource()
-        elif self.source_scheme == "adjust":
-            return AdjustSource()
-        elif self.source_scheme == "zendesk":
-            return ZendeskSource()
-        elif self.source_scheme == "mmap":
-            return ArrowMemoryMappedSource()
-        elif self.source_scheme == "s3":
-            return S3Source()
+        elif self.source_scheme in self.sources:
+            return self.sources[self.source_scheme]()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
     def get_destination(self) -> DestinationProtocol:
-        match = {
-            "bigquery": BigQueryDestination(),
-            "databricks": DatabricksDestination(),
-            "duckdb": DuckDBDestination(),
-            "mssql": MsSQLDestination(),
-            "postgres": PostgresDestination(),
-            "postgresql": PostgresDestination(),
-            "postgresql+psycopg2": PostgresDestination(),
-            "redshift": RedshiftDestination(),
-            "redshift+psycopg2": RedshiftDestination(),
-            "redshift+redshift_connector": RedshiftDestination(),
-            "snowflake": SnowflakeDestination(),
-            "synapse": SynapseDestination(),
-            "csv": CsvDestination(),
-            "athena": AthenaDestination(),
-        }
-
-        if self.destination_scheme in match:
-            return match[self.destination_scheme]
+        if self.destination_scheme in self.destinations:
+            return self.destinations[self.destination_scheme]()
        else:
             raise ValueError(
                 f"Unsupported destination scheme: {self.destination_scheme}"
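
The refactor replaces two long elif chains with class-level lookup tables, so registering a new connector becomes a one-line dict entry. A hedged sketch of how the factory is driven; both URIs are illustrative placeholders:

from ingestr.src.factory import SourceDestinationFactory

factory = SourceDestinationFactory(
    "dynamodb://dynamodb.eu-west-1.amazonaws.com?access_key_id=KEY&secret_access_key=SECRET",
    "duckdb:///ingestr.db",
)
source = factory.get_source()            # dict lookup, returns DynamoDBSource()
destination = factory.get_destination()  # dict lookup, returns DuckDBDestination()
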
ingestr/src/sources.py CHANGED
@@ -1,9 +1,11 @@
 import base64
 import csv
 import json
+import os
+import re
 from datetime import date
 from typing import Any, Callable, Optional
-from urllib.parse import parse_qs, urlparse
+from urllib.parse import ParseResult, parse_qs, quote, urlparse
 
 import dlt
 import pendulum
@@ -20,7 +22,9 @@ from ingestr.src.adjust.adjust_helpers import parse_filters
 from ingestr.src.airtable import airtable_source
 from ingestr.src.appsflyer._init_ import appsflyer_source
 from ingestr.src.arrow import memory_mapped_arrow
+from ingestr.src.asana_source import asana_source
 from ingestr.src.chess import source
+from ingestr.src.dynamodb import dynamodb
 from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
 from ingestr.src.filesystem import readers
 from ingestr.src.filters import table_adapter_exclude_columns
@@ -36,6 +40,8 @@ from ingestr.src.shopify import shopify_source
 from ingestr.src.slack import slack_source
 from ingestr.src.stripe_analytics import stripe_source
 from ingestr.src.table_definition import table_string_to_dataclass
+from ingestr.src.tiktok_ads import tiktok_source
+from ingestr.src.time import isotime
 from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
 from ingestr.src.zendesk.helpers.credentials import (
     ZendeskCredentialsOAuth,
@@ -114,8 +120,6 @@ class ArrowMemoryMappedSource:
         return False
 
     def dlt_source(self, uri: str, table: str, **kwargs):
-        import os
-
         incremental = None
         if kwargs.get("incremental_key"):
             start_value = kwargs.get("interval_start")
@@ -952,7 +956,7 @@ class S3Source:
         )
 
         parsed_uri = urlparse(uri)
-        source_fields = parse_qs(parsed_uri.query)
+        source_fields = parse_qs(quote(parsed_uri.query, safe="=&"))
         access_key_id = source_fields.get("access_key_id")
         if not access_key_id:
             raise ValueError("access_key_id is required to connect to S3")
@@ -994,3 +998,176 @@ class S3Source:
         return readers(
             bucket_url=bucket_url, credentials=aws_credentials, file_glob=path_to_file
         ).with_resources(endpoint)
+
+
+class TikTokSource:
+    # tiktok://?access_token=<access_token>&advertiser_id=<advertiser_id>
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        endpoint = "custom_reports"
+
+        parsed_uri = urlparse(uri)
+        source_fields = parse_qs(parsed_uri.query)
+
+        access_token = source_fields.get("access_token")
+        if not access_token:
+            raise ValueError("access_token is required to connect to TikTok")
+
+        time_zone = source_fields.get("time_zone", ["UTC"])
+
+        advertiser_id = source_fields.get("advertiser_id")
+        if not advertiser_id:
+            raise ValueError("advertiser_id is required to connect to TikTok")
+
+        start_date = pendulum.now().subtract(days=90).in_tz(time_zone[0])
+        end_date = ensure_pendulum_datetime(pendulum.now()).in_tz(time_zone[0])
+
+        interval_start = kwargs.get("interval_start")
+        if interval_start is not None:
+            start_date = ensure_pendulum_datetime(interval_start).in_tz(time_zone[0])
+
+        interval_end = kwargs.get("interval_end")
+        if interval_end is not None:
+            end_date = ensure_pendulum_datetime(interval_end).in_tz(time_zone[0])
+
+        page_size = kwargs.get("page_size")
+        if page_size is not None and not isinstance(page_size, int):
+            page_size = int(page_size)
+
+        if page_size is not None and page_size > 1000:
+            page_size = 1000
+
+        if table.startswith("custom:"):
+            fields = table.split(":", 3)
+            if len(fields) != 3 and len(fields) != 4:
+                raise ValueError(
+                    "Invalid TikTok custom table format. Expected format: custom:<dimensions>:<metrics> or custom:<dimensions>:<metrics>:<filters>"
+                )
+
+            dimensions = fields[1].replace(" ", "").split(",")
+            if (
+                "campaign_id" not in dimensions
+                and "advertiser_id" not in dimensions
+                and "adgroup_id" not in dimensions
+                and "ad_id" not in dimensions
+            ):
+                raise ValueError(
+                    "You must provide one ID dimension from the following options: [campaign_id, advertiser_id, adgroup_id, ad_id]"
+                )
+
+            metrics = fields[2].replace(" ", "").split(",")
+            filters = []
+            if len(fields) == 4:
+                filters = fields[3].replace(" ", "").split(",")
+            return tiktok_source(
+                start_date=start_date,
+                end_date=end_date,
+                access_token=access_token[0],
+                advertiser_id=advertiser_id[0],
+                time_zone=time_zone[0],
+                dimensions=dimensions,
+                metrics=metrics,
+                filters=filters,
+                page_size=page_size,
+            ).with_resources(endpoint)
+
+
+class AsanaSource:
+    resources = [
+        "workspaces",
+        "projects",
+        "sections",
+        "tags",
+        "tasks",
+        "stories",
+        "teams",
+        "users",
+    ]
+
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        workspace = parsed_uri.hostname
+        access_token = params.get("access_token")
+
+        if not workspace:
+            raise ValueError("workspace ID must be specified in the URI")
+
+        if not access_token:
+            raise ValueError("access_token is required for connecting to Asana")
+
+        if table not in self.resources:
+            raise ValueError(
+                f"Resource '{table}' is not supported for the Asana source yet; if you are interested in it, please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
+
+        dlt.secrets["sources.asana_source.access_token"] = access_token[0]
+        src = asana_source()
+        src.workspaces.add_filter(lambda w: w["gid"] == workspace)
+        return src.with_resources(table)
+
+
+class DynamoDBSource:
+    AWS_ENDPOINT_PATTERN = re.compile(r".*\.(.+)\.amazonaws\.com")
+
+    def infer_aws_region(self, uri: ParseResult) -> Optional[str]:
+        # try to infer the region from the URI
+        matches = self.AWS_ENDPOINT_PATTERN.match(uri.netloc)
+        if matches is not None:
+            return matches[1]
+
+        # else obtain the region from the query string
+        region = parse_qs(uri.query).get("region")
+        if region is None:
+            return None
+        return region[0]
+
+    def get_endpoint_url(self, url: ParseResult) -> str:
+        if self.AWS_ENDPOINT_PATTERN.match(url.netloc) is not None:
+            return f"https://{url.hostname}"
+        return f"http://{url.netloc}"
+
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+
+        region = self.infer_aws_region(parsed_uri)
+        if not region:
+            raise ValueError("region is required to connect to DynamoDB")
+
+        qs = parse_qs(quote(parsed_uri.query, safe="=&"))
+        access_key = qs.get("access_key_id")
+
+        if not access_key:
+            raise ValueError("access_key_id is required to connect to DynamoDB")
+
+        secret_key = qs.get("secret_access_key")
+        if not secret_key:
+            raise ValueError("secret_access_key is required to connect to DynamoDB")
+
+        creds = AwsCredentials(
+            aws_access_key_id=access_key[0],
+            aws_secret_access_key=TSecretStrValue(secret_key[0]),
+            region_name=region,
+            endpoint_url=self.get_endpoint_url(parsed_uri),
+        )
+
+        incremental = None
+        incremental_key = kwargs.get("incremental_key")
+
+        if incremental_key:
+            incremental = dlt.sources.incremental(
+                incremental_key.strip(),
+                initial_value=isotime(kwargs.get("interval_start")),
+                end_value=isotime(kwargs.get("interval_end")),
+            )
+
+        return dynamodb(table, creds, incremental)
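
To make the custom table grammar above concrete, here is how a hypothetical table string splits; the dimension and metric names are illustrative:

table = "custom:campaign_id,stat_time_day:spend,impressions"

fields = table.split(":", 3)
# ['custom', 'campaign_id,stat_time_day', 'spend,impressions']

dimensions = fields[1].replace(" ", "").split(",")  # ['campaign_id', 'stat_time_day']
metrics = fields[2].replace(" ", "").split(",")     # ['spend', 'impressions']
# no fourth segment here, so filters stays []
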
ingestr/src/tiktok_ads/__init__.py ADDED
@@ -0,0 +1,106 @@
+from typing import Iterable, Optional
+
+import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import TDataItem
+from dlt.sources import DltResource
+
+from .tiktok_helpers import TikTokAPI
+
+
+def find_intervals(
+    current_date: pendulum.DateTime,
+    end_date: pendulum.DateTime,
+    interval_days: int,
+):
+    intervals = []
+    while current_date <= end_date:
+        interval_end = min(current_date.add(days=interval_days), end_date)
+        intervals.append((current_date, interval_end))
+        current_date = interval_end.add(days=1)
+
+    return intervals
+
+
+def fetch_tiktok_reports(
+    tiktok_api: TikTokAPI,
+    current_date: pendulum.DateTime,
+    interval_end: pendulum.DateTime,
+    advertiser_id: str,
+    dimensions: list[str],
+    metrics: list[str],
+    filters: Optional[dict],
+) -> Iterable[TDataItem]:
+    try:
+        yield from tiktok_api.fetch_pages(
+            advertiser_id=advertiser_id,
+            start_time=current_date,
+            end_time=interval_end,
+            dimensions=dimensions,
+            metrics=metrics,
+            filters=None,
+        )
+    except Exception as e:
+        raise RuntimeError(f"Error fetching TikTok report: {e}")
+
+
+@dlt.source(max_table_nesting=0)
+def tiktok_source(
+    start_date: pendulum.DateTime,
+    end_date: pendulum.DateTime,
+    access_token: str,
+    advertiser_id: str,
+    time_zone: str,
+    page_size: int,
+    dimensions: list[str],
+    metrics: list[str],
+    filters=None,
+) -> DltResource:
+    tiktok_api = TikTokAPI(
+        access_token=access_token, time_zone=time_zone, page_size=page_size
+    )
+    incremental_loading_param = ""
+    is_incremental = False
+    interval_days = 365
+
+    if "stat_time_day" in dimensions:
+        incremental_loading_param = "stat_time_day"
+        is_incremental = True
+        interval_days = 30
+
+    if "stat_time_hour" in dimensions:
+        incremental_loading_param = "stat_time_hour"
+        is_incremental = True
+        interval_days = 0
+
+    @dlt.resource(write_disposition="merge", primary_key=dimensions)
+    def custom_reports(
+        datetime=dlt.sources.incremental(incremental_loading_param, start_date)
+        if is_incremental
+        else None,
+    ) -> Iterable[TDataItem]:
+        current_date = start_date.in_tz(time_zone)
+
+        if datetime is not None:
+            datetime_str = datetime.last_value
+            current_date = ensure_pendulum_datetime(datetime_str).in_tz(time_zone)
+
+        list_of_interval = find_intervals(
+            current_date=current_date,
+            end_date=end_date,
+            interval_days=interval_days,
+        )
+
+        for start, end in list_of_interval:
+            yield from fetch_tiktok_reports(
+                tiktok_api=tiktok_api,
+                current_date=start,
+                interval_end=end,
+                advertiser_id=advertiser_id,
+                dimensions=dimensions,
+                metrics=metrics,
+                filters=None,
+            )
+
+    return custom_reports
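
A quick worked example of find_intervals, which drives the chunked report fetches; the dates are illustrative:

import pendulum

from ingestr.src.tiktok_ads import find_intervals

start = pendulum.datetime(2024, 1, 1)
end = pendulum.datetime(2024, 3, 1)

print(find_intervals(start, end, interval_days=30))
# [(2024-01-01, 2024-01-31), (2024-02-01, 2024-03-01)]
# each window is inclusive, and the next window starts the day after the previous end
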
ingestr/src/tiktok_ads/tiktok_helpers.py ADDED
@@ -0,0 +1,112 @@
+import json
+
+import requests
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.sources.helpers.requests import Client
+
+BASE_URL = "https://business-api.tiktok.com/open_api/v1.3/report/integrated/get/"
+
+
+def retry_on_limit(
+    response: requests.Response | None, exception: BaseException | None
+) -> bool:
+    if response is None:
+        return False
+    return response.status_code == 429
+
+
+def create_client() -> requests.Session:
+    return Client(
+        request_timeout=10.0,
+        raise_for_status=False,
+        retry_condition=retry_on_limit,
+        request_max_attempts=12,
+        request_backoff_factor=2,
+    ).session
+
+
+def flat_structure(items, time_zone="UTC"):
+    for item in items:
+        if "dimensions" in item:
+            for key, value in item["dimensions"].items():
+                if key == "stat_time_day":
+                    item["stat_time_day"] = ensure_pendulum_datetime(value).in_tz(
+                        time_zone
+                    )
+                elif key == "stat_time_hour":
+                    item["stat_time_hour"] = ensure_pendulum_datetime(value).in_tz(
+                        time_zone
+                    )
+                else:
+                    item[key] = value
+            del item["dimensions"]
+
+        for key, value in item["metrics"].items():
+            item[key] = value
+        del item["metrics"]
+
+    return items
+
+
+class TikTokAPI:
+    def __init__(self, access_token, time_zone, page_size):
+        self.headers = {
+            "Access-Token": access_token,
+        }
+        self.time_zone = time_zone
+        self.page_size = page_size
+
+    def fetch_pages(
+        self, advertiser_id: str, start_time, end_time, dimensions, metrics, filters
+    ):
+        data_level_mapping = {
+            "advertiser_id": "AUCTION_ADVERTISER",
+            "campaign_id": "AUCTION_CAMPAIGN",
+            "adgroup_id": "AUCTION_ADGROUP",
+        }
+
+        data_level = "AUCTION_AD"
+        for id_dimension in dimensions:
+            if id_dimension in data_level_mapping:
+                data_level = data_level_mapping[id_dimension]
+                break
+
+        current_page = 1
+        start_time = ensure_pendulum_datetime(start_time).to_date_string()
+        end_time = ensure_pendulum_datetime(end_time).to_date_string()
+
+        self.params = {
+            "advertiser_id": advertiser_id,
+            "report_type": "BASIC",
+            "data_level": data_level,
+            "start_date": start_time,
+            "end_date": end_time,
+            "page_size": self.page_size,
+            "dimensions": json.dumps(dimensions),
+            "metrics": json.dumps(metrics),
+        }
+        client = create_client()
+        while True:
+            self.params["page"] = current_page
+            response = client.get(
+                url=BASE_URL, headers=self.headers, params=self.params
+            )
+
+            result = response.json()
+            if result.get("message") != "OK":
+                raise ValueError(result.get("message", ""))
+
+            result_data = result.get("data", {})
+            items = result_data.get("list", [])
+
+            flat_structure(items=items, time_zone=self.time_zone)
+
+            yield items
+
+            page_info = result_data.get("page_info", {})
+            total_pages = page_info.get("total_page", 1)
+
+            if current_page >= total_pages:
+                break
+
+            current_page += 1
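
For reference, what flat_structure does to a single report row; the sample values are made up:

from ingestr.src.tiktok_ads.tiktok_helpers import flat_structure

row = {
    "dimensions": {"campaign_id": "123", "stat_time_day": "2024-05-01 00:00:00"},
    "metrics": {"spend": "42.50", "impressions": "1000"},
}
flat_structure([row], time_zone="UTC")
# row is flattened in place:
# {"campaign_id": "123",
#  "stat_time_day": <pendulum DateTime 2024-05-01T00:00:00+00:00>,
#  "spend": "42.50",
#  "impressions": "1000"}
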
ingestr/src/time.py ADDED
ingestr/src/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.10.4"
+__version__ = "0.12.2"
ingestr/src/zendesk/__init__.py CHANGED

{ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/METADATA CHANGED
@@ -1,10 +1,11 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: ingestr
-Version: 0.10.4
+Version: 0.12.2
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
 Author-email: Burak Karakan <burak.karakan@getbruin.com>
+License-File: LICENSE.md
 Classifier: Development Status :: 4 - Beta
 Classifier: Environment :: Console
 Classifier: Intended Audience :: Developers
@@ -13,6 +14,7 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Database
 Requires-Python: >=3.9
+Requires-Dist: asana==3.2.3
 Requires-Dist: confluent-kafka>=2.6.1
 Requires-Dist: databricks-sql-connector==2.9.3
 Requires-Dist: dlt==1.4.0
@@ -199,7 +201,7 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <tr>
 <td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
 </tr>
-
+<td>Adjust</td>
 <td>✅</td>
 <td>-</td>
 <tr>
@@ -207,17 +209,27 @@ Pull requests are welcome. However, please open an issue first to discuss what y
 <td>✅</td>
 <td>-</td>
 </tr>
-
+<tr>
 <td>AppsFlyer</td>
 <td>✅</td>
 <td>-</td>
 </tr>
+<tr>
+<td>Asana</td>
+<td>✅</td>
+<td>-</td>
+</tr>
 <tr>
 <td>Chess.com</td>
 <td>✅</td>
 <td>-</td>
 </tr>
-
+<tr>
+<td>DynamoDB</td>
+<td>✅</td>
+<td>-</td>
+</tr>
+<tr>
 <td>Facebook Ads</td>
 <td>✅</td>
 <td>-</td>
{ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/RECORD CHANGED
@@ -1,20 +1,25 @@
 ingestr/main.py,sha256=wkU2uLMy1q8YarJ9mXNfJepeRjp6AuPDeNDOmMUt6n0,22309
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/destinations.py,sha256=zcHJIIHAZmcD9sJomd6G1Bc-1KsxnBD2aByOSV_9L3g,8850
-ingestr/src/factory.py,sha256=
+ingestr/src/factory.py,sha256=UyE1TzTHn_V8JZno5SSYfQsho1eFYzzvOylogw4S49E,4389
 ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
-ingestr/src/sources.py,sha256=
+ingestr/src/sources.py,sha256=QCyfkhLl5jgmosZUeh4BTrmqHk74Vus7zLgk_MBdPhc,41096
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
-ingestr/src/
+ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
+ingestr/src/version.py,sha256=NJQQPiZZfrBXFMqZlsia0JrhloS2PexbdxYYUs0c2Us,23
 ingestr/src/adjust/__init__.py,sha256=NaRNwDhItG8Q7vUHw7zQvyfWjmT32M0CSc5ufjmBM9U,3067
 ingestr/src/adjust/adjust_helpers.py,sha256=-tmmxy9k3wms-ZEIgxmlp2cAQ2X_O1lgjY1128bbMu4,3224
 ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
 ingestr/src/appsflyer/_init_.py,sha256=ne2-9FQ654Drtd3GkKQv8Bwb6LEqCnJw49MfO5Jyzgs,739
 ingestr/src/appsflyer/client.py,sha256=TNmwakLzmO6DZW3wcfLfQRl7aNBHgFqSsk4ef-MmJ1w,3084
 ingestr/src/arrow/__init__.py,sha256=AgU7S9Ra3ZeeG00Mf32zxO5sgMFfRnTdOSirUJ1Pu10,2976
+ingestr/src/asana_source/__init__.py,sha256=Y4Ti_876Yong420fQ2o4A97TdgrZNlZVxlTMLyXdSjA,8116
+ingestr/src/asana_source/helpers.py,sha256=PukcdDQWIGqnGxuuobbLw4hUy4-t6gxXg_XywR7Lg9M,375
+ingestr/src/asana_source/settings.py,sha256=-2tpdkwh04RvLKFvwQodnFLYn9MaxOO1hsebGnDQMTU,2829
 ingestr/src/chess/__init__.py,sha256=y0Q8aKBigeKf3N7wuB_gadMQjVJzBPUT8Jhp1ObEWjk,6812
 ingestr/src/chess/helpers.py,sha256=v1HTImOMjAF7AzZUPDIuHu00e7ut0o5y1kWcVYo4QZw,549
 ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k,158
+ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
 ingestr/src/facebook_ads/__init__.py,sha256=ZZyogV48gmhDcC3CYQEsC4qT3Q6JI9IOnMff2NS1M-A,9207
 ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
 ingestr/src/facebook_ads/helpers.py,sha256=ZLbNHiKer5lPb4g3_435XeBJr57Wv0o1KTyBA1mQ100,9068
@@ -56,7 +61,9 @@ ingestr/src/stripe_analytics/helpers.py,sha256=iqZOyiGIOhOAhVXXU16DP0hkkTKcTrDu6
 ingestr/src/stripe_analytics/settings.py,sha256=rl9L5XumxO0pjkZf7MGesXHp4QLRgnz3RWLuDWDBKXo,380
 ingestr/src/telemetry/event.py,sha256=MpWc5tt0lSJ1pWKe9HQ11BHrcPBxSH40l4wjZi9u0tI,924
 ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
-ingestr/src/
+ingestr/src/tiktok_ads/__init__.py,sha256=vJjVxEw3W1Rvc2QDQbox_8Ma0Cp1RT7iKsQ9MAv6Cgc,3036
+ingestr/src/tiktok_ads/tiktok_helpers.py,sha256=lY7yWl_aJh5Hj-bVvt07MHvhfvXnghaGOLhGHF5gLh4,3444
+ingestr/src/zendesk/__init__.py,sha256=C7HkN195DGdOHId2_Sa_kAlcBrUmnVYZUa_tPkiyf1Q,17564
 ingestr/src/zendesk/settings.py,sha256=Vdj706nTJFQ-3KH4nO97iYCQuba3dV3E9gfnmLK6xwU,2294
 ingestr/src/zendesk/helpers/__init__.py,sha256=YTJejCiUjfIcsj9FrkY0l-JGYDI7RRte1Ydq5FDH_0c,888
 ingestr/src/zendesk/helpers/api_helpers.py,sha256=dMkNn4ZQXgJTDOXAAXdmRt41phNFoRhYyPaLJih0pZY,4184
@@ -70,8 +77,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
 ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
-ingestr-0.
-ingestr-0.
-ingestr-0.
-ingestr-0.
-ingestr-0.
+ingestr-0.12.2.dist-info/METADATA,sha256=SAZJKqigL1ARQdv3eGX4RZVigZwYJCEcCt36lpvZtsQ,7910
+ingestr-0.12.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.12.2.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.12.2.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.12.2.dist-info/RECORD,,

{ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/entry_points.txt: File without changes
{ingestr-0.10.4.dist-info → ingestr-0.12.2.dist-info}/licenses/LICENSE.md: File without changes