ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/partition.py
ADDED
@@ -0,0 +1,32 @@
from typing import Dict

from dlt.common.schema.typing import TColumnSchema
from dlt.sources import DltResource, DltSource

import ingestr.src.resource as resource


def apply_athena_hints(
    source: DltSource | DltResource,
    partition_column: str,
    additional_hints: Dict[str, TColumnSchema] = {},
) -> None:
    from dlt.destinations.adapters import athena_adapter, athena_partition

    def _apply_partition_hint(resource: DltResource) -> None:
        columns = resource.columns if resource.columns else {}

        partition_hint = (
            columns.get(partition_column)  # type: ignore
            or additional_hints.get(partition_column)
        )

        athena_adapter(
            resource,
            athena_partition.day(partition_column)
            if partition_hint
            and partition_hint.get("data_type") in ("timestamp", "date")
            else partition_column,
        )

    resource.for_each(source, _apply_partition_hint)
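A minimal usage sketch (not part of the diff) of the helper above. It assumes `resource.for_each` in `ingestr/src/resource.py` walks both sources and bare resources, as the `DltSource | DltResource` type hint suggests; the `events` resource is hypothetical.

import dlt

from ingestr.src.partition import apply_athena_hints


# Hypothetical resource, for illustration only.
@dlt.resource(columns={"created_at": {"data_type": "timestamp"}})
def events():
    yield {"id": 1, "created_at": "2024-05-01T00:00:00+00:00"}


# "created_at" carries a timestamp hint, so the helper wraps it in
# athena_partition.day(); a column with no date/timestamp hint would be
# passed to athena_adapter as a plain partition column name.
apply_athena_hints(events(), "created_at")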
ingestr/src/personio/__init__.py
ADDED
@@ -0,0 +1,331 @@
"""Fetches Personio Employees, Absences, Attendances."""

from typing import Iterable, Optional

import dlt
from dlt.common import pendulum
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TAnyDateTime, TDataItem
from dlt.sources import DltResource

from .helpers import PersonioAPI


@dlt.source(name="personio", max_table_nesting=0)
def personio_source(
    start_date: TAnyDateTime,
    end_date: Optional[TAnyDateTime] = None,
    client_id: str = dlt.secrets.value,
    client_secret: str = dlt.secrets.value,
    items_per_page: int = 200,
) -> Iterable[DltResource]:
    """
    The source for the Personio pipeline. Available resources are employees, absences, and attendances.

    Args:
        client_id: The client ID of your app.
        client_secret: The client secret of your app.
        items_per_page: The max number of items to fetch per page. Defaults to 200.
    Returns:
        Iterable: A list of DltResource objects representing the data resources.
    """

    client = PersonioAPI(client_id, client_secret)

    @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
    def employees(
        updated_at: dlt.sources.incremental[
            pendulum.DateTime
        ] = dlt.sources.incremental(
            "last_modified_at", initial_value=None, allow_external_schedulers=True
        ),
        items_per_page: int = items_per_page,
    ) -> Iterable[TDataItem]:
        """
        The resource for employees, supports incremental loading and pagination.

        Args:
            updated_at: The saved state of the last 'last_modified_at' value.
            items_per_page: The max number of items to fetch per page. Defaults to 200.

        Returns:
            Iterable: A generator of employees.
        """

        def convert_item(item: TDataItem) -> TDataItem:
            """Converts an employee item."""
            attributes = item.get("attributes", {})
            output = {}
            for value in attributes.values():
                name = value["universal_id"]
                if not name:
                    label: str = value["label"].replace(" ", "_")
                    name = label.lower()

                if value["type"] == "date" and value["value"]:
                    output[name] = ensure_pendulum_datetime(value["value"])
                else:
                    output[name] = value["value"]
            return output

        if updated_at.last_value:
            last_value = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
        else:
            last_value = None

        params = {"limit": items_per_page, "updated_since": last_value}

        pages = client.get_pages("company/employees", params=params)
        for page in pages:
            yield [convert_item(item) for item in page]

    @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
    def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]:
        """
        The resource for absence types (time-off-types), supports pagination.

        Args:
            items_per_page: The max number of items to fetch per page. Defaults to 200.

        Returns:
            Iterable: A generator of absence types.
        """

        pages = client.get_pages(
            "company/time-off-types", params={"limit": items_per_page}
        )

        for page in pages:
            yield [item.get("attributes", {}) for item in page]

    @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
    def absences(
        updated_at: dlt.sources.incremental[
            pendulum.DateTime
        ] = dlt.sources.incremental(
            "updated_at", initial_value=None, allow_external_schedulers=True
        ),
        items_per_page: int = items_per_page,
    ) -> Iterable[TDataItem]:
        """
        The resource for absences (time-offs), supports incremental loading and pagination.

        Args:
            updated_at: The saved state of the last 'updated_at' value.
            items_per_page: The max number of items to fetch per page. Defaults to 200.

        Returns:
            Iterable: A generator of absences.
        """
        if updated_at.last_value:
            updated_iso = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
        else:
            updated_iso = None

        params = {
            "limit": items_per_page,
            "updated_since": updated_iso,
        }

        def convert_item(item: TDataItem) -> TDataItem:
            output = item.get("attributes", {})
            output["created_at"] = ensure_pendulum_datetime(output["created_at"])
            output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
            return output

        pages = client.get_pages(
            "company/time-offs",
            params=params,
            offset_by_page=True,
        )

        for page in pages:
            yield [convert_item(item) for item in page]

    @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
    def attendances(
        start_date: TAnyDateTime = start_date,
        end_date: Optional[TAnyDateTime] = end_date,
        updated_at: dlt.sources.incremental[
            pendulum.DateTime
        ] = dlt.sources.incremental(
            "updated_at", initial_value=None, allow_external_schedulers=True
        ),
        items_per_page: int = items_per_page,
    ) -> Iterable[TDataItem]:
        """
        The resource for attendances, supports incremental loading and pagination.

        Args:
            start_date: The start date to fetch attendances from.
            end_date: The end date to fetch attendances from. Defaults to now.
            updated_at: The saved state of the last 'updated_at' value.
            items_per_page: The max number of items to fetch per page. Defaults to 200.

        Returns:
            Iterable: A generator of attendances.
        """

        end_date = end_date or pendulum.now()
        if updated_at.last_value:
            updated_iso = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
        else:
            updated_iso = None

        params = {
            "limit": items_per_page,
            "start_date": ensure_pendulum_datetime(start_date).to_date_string(),
            "end_date": ensure_pendulum_datetime(end_date).to_date_string(),
            "updated_from": updated_iso,
            "includePending": True,
        }
        pages = client.get_pages(
            "company/attendances",
            params=params,
        )

        def convert_item(item: TDataItem) -> TDataItem:
            """Converts an attendance item."""
            output = dict(id=item["id"], **item.get("attributes"))
            output["date"] = ensure_pendulum_datetime(output["date"]).date()
            output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
            return output

        for page in pages:
            yield [convert_item(item) for item in page]

    @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
    def projects() -> Iterable[TDataItem]:
        """
        The resource for projects.

        Returns:
            Iterable: A generator of projects.
        """

        pages = client.get_pages("company/attendances/projects")

        def convert_item(item: TDataItem) -> TDataItem:
            """Converts a project item."""
            output = dict(id=item["id"], **item.get("attributes"))
            output["created_at"] = ensure_pendulum_datetime(output["created_at"])
            output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
            return output

        for page in pages:
            yield [convert_item(item) for item in page]

    @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
    def document_categories() -> Iterable[TDataItem]:
        """
        The resource for document_categories.

        Returns:
            Iterable: A generator of document_categories.
        """

        pages = client.get_pages("company/document-categories")

        def convert_item(item: TDataItem) -> TDataItem:
            """Converts a document_categories item."""
            output = dict(id=item["id"], **item.get("attributes"))
            return output

        for page in pages:
            yield [convert_item(item) for item in page]

    @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
    def custom_reports_list() -> Iterable[TDataItem]:
        """
        The resource for custom_reports.

        Returns:
            Iterable: A generator of custom_reports.
        """

        pages = client.get_pages("company/custom-reports/reports")

        for page in pages:
            yield [item.get("attributes", {}) for item in page]

    @dlt.transformer(
        data_from=employees,
        write_disposition="merge",
        primary_key=["employee_id", "id"],
    )
    @dlt.defer
    def employees_absences_balance(employees_item: TDataItem) -> Iterable[TDataItem]:
        """
        The transformer for employees_absences_balance.

        Args:
            employees_item: The employee data.

        Returns:
            Iterable: A generator of employees_absences_balance for each employee.
        """
        for employee in employees_item:
            employee_id = employee["id"]
            pages = client.get_pages(
                f"company/employees/{employee_id}/absences/balance",
            )

            for page in pages:
                yield [dict(employee_id=employee_id, **i) for i in page]

    @dlt.transformer(
        data_from=custom_reports_list,
        write_disposition="merge",
        primary_key=["report_id", "item_id"],
    )
    @dlt.defer
    def custom_reports(
        custom_reports_item: TDataItem, items_per_page: int = items_per_page
    ) -> Iterable[TDataItem]:
        """
        The transformer for custom reports, supports pagination.

        Args:
            custom_reports_item: The custom_report data.
            items_per_page: The max number of items to fetch per page. Defaults to 200.

        Returns:
            Iterable: A generator of custom report items for each report.
        """

        def convert_item(item: TDataItem, report_id: str) -> TDataItem:
            """Converts a custom report item."""
            attributes = item.pop("attributes")
            output = dict(report_id=report_id, item_id=list(item.values())[0])
            for value in attributes:
                name = value["attribute_id"]
                if value["data_type"] == "date" and value["value"]:
                    output[name] = ensure_pendulum_datetime(value["value"])
                else:
                    output[name] = value["value"]
            return output

        for custom_report in custom_reports_item:
            report_id = custom_report["id"]
            pages = client.get_pages(
                f"company/custom-reports/reports/{report_id}",
                params={"limit": items_per_page},
                offset_by_page=True,
            )

            for page in pages:
                for report in page:
                    report_items = report.get("attributes", {}).get("items", [])
                    yield [convert_item(item, report_id) for item in report_items]

    return (
        employees,
        absence_types,
        absences,
        attendances,
        projects,
        document_categories,
        employees_absences_balance,
        custom_reports_list,
        custom_reports,
    )
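A hedged run sketch (not part of the diff) for the Personio source above; the credentials are placeholders and DuckDB is an arbitrary destination choice.

import dlt
import pendulum

from ingestr.src.personio import personio_source

pipeline = dlt.pipeline(
    pipeline_name="personio",
    destination="duckdb",
    dataset_name="personio_raw",
)

source = personio_source(
    start_date=pendulum.datetime(2024, 1, 1),
    client_id="YOUR_CLIENT_ID",  # placeholder
    client_secret="YOUR_CLIENT_SECRET",  # placeholder
)

# Select a subset of the nine declared resources; transformers such as
# employees_absences_balance pull their input from the employees resource.
info = pipeline.run(source.with_resources("employees", "absences"))
print(info)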
ingestr/src/personio/helpers.py
ADDED
@@ -0,0 +1,86 @@
"""Personio source helpers"""

from typing import Any, Iterable, Optional
from urllib.parse import urljoin

from dlt.common.typing import Dict, TDataItems
from dlt.sources.helpers import requests


class PersonioAPI:
    """A Personio API client."""

    base_url = "https://api.personio.de/v1/"

    def __init__(self, client_id: str, client_secret: str) -> None:
        """
        Args:
            client_id: The client ID of your app.
            client_secret: The client secret of your app.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.access_token = self.get_token()

    def get_token(self) -> str:
        """Get an access token from Personio.

        Returns:
            The access token.
        """
        headers = {"Content-Type": "application/json", "Accept": "application/json"}
        data = {"client_id": self.client_id, "client_secret": self.client_secret}
        url = urljoin(self.base_url, "auth")
        response = requests.request("POST", url, headers=headers, json=data)
        json_response = response.json()
        token: str = json_response["data"]["token"]
        return token

    def get_pages(
        self,
        resource: str,
        params: Optional[Dict[str, Any]] = None,
        offset_by_page: bool = False,
    ) -> Iterable[TDataItems]:
        """Get all pages from Personio using requests.

        Args:
            resource: The resource to get pages for (e.g. employees, absences, attendances).
            params: The parameters for the resource.
            offset_by_page (bool): If True, offset increases by 1 per page; else, increases by page_size.

        Yields:
            List of data items from the page
        """
        params = params or {}
        headers = {"Authorization": f"Bearer {self.access_token}"}
        params.update({"offset": int(offset_by_page), "page": int(offset_by_page)})
        url = urljoin(self.base_url, resource)
        starts_from_zero = False
        while True:
            response = requests.get(url, headers=headers, params=params)
            json_response = response.json()
            # Get an item list from the page
            yield json_response["data"]

            metadata = json_response.get("metadata")
            if not metadata:
                break

            total_pages = metadata.get("total_pages")
            current_page = metadata.get("current_page")
            if current_page == 0:
                starts_from_zero = True

            if (
                current_page >= (total_pages - int(starts_from_zero))
                or not json_response["data"]
            ):
                break

            if offset_by_page:
                params["offset"] += 1
                params["page"] += 1
            else:
                params["offset"] += params["limit"]
                params["page"] += 1
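A small sketch (not part of the diff) of driving the client directly; credentials are placeholders. The client has two pagination modes: with offset_by_page=True both "offset" and "page" start at 1 and advance by one per page, otherwise they start at 0 and "offset" advances by the "limit" value.

from ingestr.src.personio.helpers import PersonioAPI

api = PersonioAPI("YOUR_CLIENT_ID", "YOUR_CLIENT_SECRET")  # placeholders

# Offset-based pagination: "offset" grows by the page limit each round.
for page in api.get_pages("company/employees", params={"limit": 200}):
    print(f"fetched {len(page)} employee records")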
ingestr/src/phantombuster/__init__.py
ADDED
@@ -0,0 +1,65 @@
from typing import Iterable, Optional

import dlt
import pendulum
import requests
from dlt.common.typing import TAnyDateTime, TDataItem
from dlt.sources import DltResource
from dlt.sources.helpers.requests import Client

from ingestr.src.phantombuster.client import PhantombusterClient


def retry_on_limit(
    response: Optional[requests.Response], exception: Optional[BaseException]
) -> bool:
    if response is not None and response.status_code == 429:
        return True
    return False


def create_client() -> requests.Session:
    return Client(
        raise_for_status=False,
        retry_condition=retry_on_limit,
        request_max_attempts=12,
        request_backoff_factor=2,
    ).session


@dlt.source(max_table_nesting=0)
def phantombuster_source(
    api_key: str, agent_id: str, start_date: TAnyDateTime, end_date: TAnyDateTime | None
) -> Iterable[DltResource]:
    client = PhantombusterClient(api_key)

    @dlt.resource(
        write_disposition="merge",
        primary_key="container_id",
        columns={
            "partition_dt": {"data_type": "date", "partition": True},
        },
    )
    def completed_phantoms(
        dateTime=(
            dlt.sources.incremental(
                "ended_at",
                initial_value=start_date,
                end_value=end_date,
                range_start="closed",
                range_end="closed",
            )
        ),
    ) -> Iterable[TDataItem]:
        if dateTime.end_value is None:
            end_dt = pendulum.now(tz="UTC")
        else:
            end_dt = dateTime.end_value

        start_dt = dateTime.last_value

        yield client.fetch_containers_result(
            create_client(), agent_id, start_date=start_dt, end_date=end_dt
        )

    return completed_phantoms
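A hedged run sketch (not part of the diff) for the Phantombuster source; api_key and agent_id are placeholders. Note that create_client() builds a session that retries HTTP 429 responses up to 12 times with exponential backoff.

import dlt
import pendulum

from ingestr.src.phantombuster import phantombuster_source

pipeline = dlt.pipeline(pipeline_name="phantombuster", destination="duckdb")

info = pipeline.run(
    phantombuster_source(
        api_key="YOUR_API_KEY",  # placeholder
        agent_id="YOUR_AGENT_ID",  # placeholder
        start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
        end_date=None,  # open-ended: the resource substitutes pendulum.now(tz="UTC")
    )
)
print(info)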
ingestr/src/phantombuster/client.py
ADDED
@@ -0,0 +1,87 @@
from typing import Union

import pendulum
import requests


class PhantombusterClient:
    def __init__(self, api_key: str):
        self.api_key = api_key

    def _get_headers(self):
        return {
            "X-Phantombuster-Key-1": self.api_key,
            "accept": "application/json",
        }

    def fetch_containers_result(
        self,
        session: requests.Session,
        agent_id: str,
        start_date: pendulum.DateTime,
        end_date: pendulum.DateTime,
    ):
        url = "https://api.phantombuster.com/api/v2/containers/fetch-all/"
        before_ended_at = None
        limit = 100

        started_at = start_date.int_timestamp * 1000 + int(
            start_date.microsecond / 1000
        )
        ended_at = end_date.int_timestamp * 1000 + int(end_date.microsecond / 1000)

        while True:
            params: dict[str, Union[str, int, float, bytes, None]] = {
                "agentId": agent_id,
                "limit": limit,
                "mode": "finalized",
            }

            if before_ended_at:
                params["beforeEndedAt"] = before_ended_at

            response = session.get(url=url, headers=self._get_headers(), params=params)
            data = response.json()
            containers = data.get("containers", [])

            for container in containers:
                container_ended_at = container.get("endedAt")

                if before_ended_at is None or before_ended_at > container_ended_at:
                    before_ended_at = container_ended_at

                if container_ended_at < started_at or container_ended_at > ended_at:
                    continue

                try:
                    result = self.fetch_result_object(session, container["id"])
                    partition_dt = pendulum.from_timestamp(
                        container_ended_at / 1000, tz="UTC"
                    ).date()
                    container_ended_at_datetime = pendulum.from_timestamp(
                        container_ended_at / 1000, tz="UTC"
                    )
                    row = {
                        "container_id": container["id"],
                        "container": container,
                        "result": result,
                        "partition_dt": partition_dt,
                        "ended_at": container_ended_at_datetime,
                    }
                    yield row

                except requests.RequestException as e:
                    print(f"Error fetching result for container {container['id']}: {e}")

            if data["maxLimitReached"] is False:
                break

    def fetch_result_object(self, session: requests.Session, container_id: str):
        result_url = (
            "https://api.phantombuster.com/api/v2/containers/fetch-result-object"
        )
        params = {"id": container_id}
        response = session.get(result_url, headers=self._get_headers(), params=params)
        response.raise_for_status()

        return response.json()
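A short sketch (not part of the diff) of the epoch-millisecond window the client filters on: int_timestamp is whole seconds, so the sub-second remainder is recovered from the microsecond field. Containers whose endedAt (also epoch milliseconds) falls outside [started_at, ended_at] are skipped.

import pendulum

dt = pendulum.datetime(2024, 5, 1, 12, 30, 15, 250000, tz="UTC")

# Same arithmetic as fetch_containers_result: seconds * 1000 plus the
# millisecond part of the microsecond field.
millis = dt.int_timestamp * 1000 + int(dt.microsecond / 1000)
assert millis == 1714566615250  # 2024-05-01T12:30:15.250Z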
ingestr/src/pinterest/__init__.py
ADDED
@@ -0,0 +1,82 @@
from typing import Iterable

import dlt
import pendulum
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TDataItem
from dlt.sources import DltResource
from dlt.sources.helpers import requests


@dlt.source(name="pinterest", max_table_nesting=0)
def pinterest_source(
    start_date: pendulum.DateTime,
    access_token: str,
    page_size: int = 200,
    end_date: pendulum.DateTime | None = None,
) -> Iterable[DltResource]:
    session = requests.Session()
    session.headers.update({"Authorization": f"Bearer {access_token}"})
    base_url = "https://api.pinterest.com/v5"

    def fetch_data(
        endpoint: str,
        start_dt: pendulum.DateTime,
        end_dt: pendulum.DateTime,
    ) -> Iterable[TDataItem]:
        url = f"{base_url}/{endpoint}"
        params = {"page_size": page_size}
        bookmark = None
        while True:
            if bookmark:
                params["bookmark"] = bookmark

            resp = session.get(url, params=params)
            resp.raise_for_status()
            data = resp.json()
            items = data.get("items") or []

            for item in items:
                item_created = ensure_pendulum_datetime(item["created_at"])
                if item_created <= start_dt:
                    continue
                if item_created > end_dt:
                    continue
                item["created_at"] = item_created
                yield item

            bookmark = data.get("bookmark")
            if not bookmark:
                break

    @dlt.resource(write_disposition="merge", primary_key="id")
    def pins(
        datetime=dlt.sources.incremental(
            "created_at",
            initial_value=start_date,
            end_value=end_date,
        ),
    ) -> Iterable[TDataItem]:
        _start_date = datetime.last_value or start_date
        if end_date is None:
            _end_date = pendulum.now("UTC")
        else:
            _end_date = datetime.end_value
        yield from fetch_data("pins", _start_date, _end_date)

    @dlt.resource(write_disposition="merge", primary_key="id")
    def boards(
        datetime=dlt.sources.incremental(
            "created_at",
            initial_value=start_date,
            end_value=end_date,
        ),
    ) -> Iterable[TDataItem]:
        _start_date = datetime.last_value or start_date
        if end_date is None:
            _end_date = pendulum.now("UTC")
        else:
            _end_date = datetime.end_value
        yield from fetch_data("boards", _start_date, _end_date)

    return pins, boards
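A hedged run sketch (not part of the diff) for the Pinterest source; the access token is a placeholder. Both resources reuse fetch_data, which follows the Pinterest v5 bookmark cursor until the API stops returning one.

import dlt
import pendulum

from ingestr.src.pinterest import pinterest_source

pipeline = dlt.pipeline(pipeline_name="pinterest", destination="duckdb")

info = pipeline.run(
    pinterest_source(
        start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
        access_token="YOUR_ACCESS_TOKEN",  # placeholder
    )
)
print(info)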