ingestr 0.13.9__py3-none-any.whl → 0.13.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/src/applovin/__init__.py CHANGED
@@ -1,10 +1,10 @@
- from datetime import datetime, timezone, timedelta
+ from datetime import datetime, timedelta, timezone
  from enum import Enum
  from typing import Dict, List, Optional
- from requests import Response

  import dlt
  from dlt.sources.rest_api import EndpointResource, RESTAPIConfig, rest_api_resources
+ from requests import Response


  class InvalidCustomReportError(Exception):
@@ -13,9 +13,11 @@ class InvalidCustomReportError(Exception):
  "Custom report should be in the format 'custom:{endpoint}:{report_type}:{dimensions}"
  )

+
  class ClientError(Exception):
  pass

+
  TYPE_HINTS = {
  "application_is_hidden": {"data_type": "bool"},
  "average_cpa": {"data_type": "double"},
@@ -119,7 +121,6 @@ def applovin_source(
  end_date: Optional[str],
  custom: Optional[str],
  ):
-
  backfill = False
  if end_date is None:
  backfill = True
@@ -127,7 +128,7 @@ def applovin_source(
  # use the greatest of yesterday and start_date
  end_date = max(
  datetime.now(timezone.utc) - timedelta(days=1),
- datetime.fromisoformat(start_date).replace(tzinfo=timezone.utc)
+ datetime.fromisoformat(start_date).replace(tzinfo=timezone.utc),
  ).strftime("%Y-%m-%d")

  config: RESTAPIConfig = {
@@ -157,7 +158,7 @@ def applovin_source(
  "paginator": "single_page",
  "response_actions": [
  http_error_handler,
- ]
+ ],
  },
  },
  "resources": [
@@ -177,8 +178,7 @@ def applovin_source(
  "advertiser-probabilistic-report",
  "probabilisticReport",
  exclude(
- REPORT_SCHEMA[ReportType.ADVERTISER],
- PROBABILISTIC_REPORT_EXCLUDE
+ REPORT_SCHEMA[ReportType.ADVERTISER], PROBABILISTIC_REPORT_EXCLUDE
  ),
  ReportType.ADVERTISER,
  ),
@@ -256,6 +256,7 @@ def exclude(source: List[str], exclude_list: List[str]) -> List[str]:
  def build_type_hints(cols: List[str]) -> dict:
  return {col: TYPE_HINTS[col] for col in cols if col in TYPE_HINTS}

+
  def http_error_handler(resp: Response):
  if not resp.ok:
  raise ClientError(f"HTTP Status {resp.status_code}: {resp.text}")
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
- version = "v0.13.9"
+ version = "v0.13.11"
ingestr/src/factory.py CHANGED
@@ -42,12 +42,14 @@ from ingestr.src.sources import (
  MongoDbSource,
  NotionSource,
  S3Source,
+ SalesforceSource,
  ShopifySource,
  SlackSource,
  SqlSource,
  StripeAnalyticsSource,
  TikTokSource,
  ZendeskSource,
+ PersonioSource,
  )

  SQL_SOURCE_SCHEMES = [
@@ -136,6 +138,8 @@ class SourceDestinationFactory:
  "linkedinads": LinkedInAdsSource,
  "applovin": AppLovinSource,
  "applovinmax": ApplovinMaxSource,
+ "salesforce": SalesforceSource,
+ "personio": PersonioSource,
  }
  destinations: Dict[str, Type[DestinationProtocol]] = {
  "bigquery": BigQueryDestination,
ingestr/src/personio/__init__.py ADDED
@@ -0,0 +1,331 @@
+ """Fetches Personio Employees, Absences, Attendances."""
+
+ from typing import Iterable, Optional
+
+ import dlt
+ from dlt.common import pendulum
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import TAnyDateTime, TDataItem
+ from dlt.sources import DltResource
+
+ from .helpers import PersonioAPI
+
+
+ @dlt.source(name="personio", max_table_nesting=0)
+ def personio_source(
+ start_date: TAnyDateTime,
+ end_date: Optional[TAnyDateTime] = None,
+ client_id: str = dlt.secrets.value,
+ client_secret: str = dlt.secrets.value,
+ items_per_page: int = 200,
+ ) -> Iterable[DltResource]:
+ """
+ The source for the Personio pipeline. Available resources are employees, absences, and attendances.
+
+ Args:
+ client_id: The client ID of your app.
+ client_secret: The client secret of your app.
+ items_per_page: The max number of items to fetch per page. Defaults to 200.
+ Returns:
+ Iterable: A list of DltResource objects representing the data resources.
+ """
+
+ client = PersonioAPI(client_id, client_secret)
+
+ @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
+ def employees(
+ updated_at: dlt.sources.incremental[
+ pendulum.DateTime
+ ] = dlt.sources.incremental(
+ "last_modified_at", initial_value=None, allow_external_schedulers=True
+ ),
+ items_per_page: int = items_per_page,
+ ) -> Iterable[TDataItem]:
+ """
+ The resource for employees, supports incremental loading and pagination.
+
+ Args:
+ updated_at: The saved state of the last 'last_modified_at' value.
+ items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+ Returns:
+ Iterable: A generator of employees.
+ """
+
+ def convert_item(item: TDataItem) -> TDataItem:
+ """Converts an employee item."""
+ attributes = item.get("attributes", {})
+ output = {}
+ for value in attributes.values():
+ name = value["universal_id"]
+ if not name:
+ label: str = value["label"].replace(" ", "_")
+ name = label.lower()
+
+ if value["type"] == "date" and value["value"]:
+ output[name] = ensure_pendulum_datetime(value["value"])
+ else:
+ output[name] = value["value"]
+ return output
+
+ if updated_at.last_value:
+ last_value = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
+ else:
+ last_value = None
+
+ params = {"limit": items_per_page, "updated_since": last_value}
+
+ pages = client.get_pages("company/employees", params=params)
+ for page in pages:
+ yield [convert_item(item) for item in page]
+
+ @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+ def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]:
+ """
+ The resource for absence types (time-off-types), supports pagination.
+
+ Args:
+ items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+ Returns:
+ Iterable: A generator of absences.
+ """
+
+ pages = client.get_pages(
+ "company/time-off-types", params={"limit": items_per_page}
+ )
+
+ for page in pages:
+ yield [item.get("attributes", {}) for item in page]
+
+ @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
+ def absences(
+ updated_at: dlt.sources.incremental[
+ pendulum.DateTime
+ ] = dlt.sources.incremental(
+ "updated_at", initial_value=None, allow_external_schedulers=True
+ ),
+ items_per_page: int = items_per_page,
+ ) -> Iterable[TDataItem]:
+ """
+ The resource for absence (time-offs), supports incremental loading and pagination.
+
+ Args:
+ updated_at: The saved state of the last 'updated_at' value.
+ items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+ Returns:
+ Iterable: A generator of absences.
+ """
+ if updated_at.last_value:
+ updated_iso = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
+ else:
+ updated_iso = None
+
+ params = {
+ "limit": items_per_page,
+ "updated_since": updated_iso,
+ }
+
+ def convert_item(item: TDataItem) -> TDataItem:
+ output = item.get("attributes", {})
+ output["created_at"] = ensure_pendulum_datetime(output["created_at"])
+ output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
+ return output
+
+ pages = client.get_pages(
+ "company/time-offs",
+ params=params,
+ offset_by_page=True,
+ )
+
+ for page in pages:
+ yield [convert_item(item) for item in page]
+
+ @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
+ def attendances(
+ start_date: TAnyDateTime = start_date,
+ end_date: Optional[TAnyDateTime] = end_date,
+ updated_at: dlt.sources.incremental[
+ pendulum.DateTime
+ ] = dlt.sources.incremental(
+ "updated_at", initial_value=None, allow_external_schedulers=True
+ ),
+ items_per_page: int = items_per_page,
+ ) -> Iterable[TDataItem]:
+ """
+ The resource for attendances, supports incremental loading and pagination.
+
+ Args:
+ start_date: The start date to fetch attendances from.
+ end_date: The end date to fetch attendances from. Defaults to now.
+ updated_at: The saved state of the last 'updated_at' value.
+ items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+ Returns:
+ Iterable: A generator of attendances.
+ """
+
+ end_date = end_date or pendulum.now()
+ if updated_at.last_value:
+ updated_iso = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
+ else:
+ updated_iso = None
+
+ params = {
+ "limit": items_per_page,
+ "start_date": ensure_pendulum_datetime(start_date).to_date_string(),
+ "end_date": ensure_pendulum_datetime(end_date).to_date_string(),
+ "updated_from": updated_iso,
+ "includePending": True,
+ }
+ pages = client.get_pages(
+ "company/attendances",
+ params=params,
+ )
+
+ def convert_item(item: TDataItem) -> TDataItem:
+ """Converts an attendance item."""
+ output = dict(id=item["id"], **item.get("attributes"))
+ output["date"] = ensure_pendulum_datetime(output["date"]).date()
+ output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
+ return output
+
+ for page in pages:
+ yield [convert_item(item) for item in page]
+
+ @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+ def projects() -> Iterable[TDataItem]:
+ """
+ The resource for projects.
+
+ Returns:
+ Iterable: A generator of projects.
+ """
+
+ pages = client.get_pages("company/attendances/projects")
+
+ def convert_item(item: TDataItem) -> TDataItem:
+ """Converts an attendance item."""
+ output = dict(id=item["id"], **item.get("attributes"))
+ output["created_at"] = ensure_pendulum_datetime(output["created_at"])
+ output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
+ return output
+
+ for page in pages:
+ yield [convert_item(item) for item in page]
+
+ @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+ def document_categories() -> Iterable[TDataItem]:
+ """
+ The resource for document_categories.
+
+ Returns:
+ Iterable: A generator of document_categories.
+ """
+
+ pages = client.get_pages("company/document-categories")
+
+ def convert_item(item: TDataItem) -> TDataItem:
+ """Converts an document_categories item."""
+ output = dict(id=item["id"], **item.get("attributes"))
+ return output
+
+ for page in pages:
+ yield [convert_item(item) for item in page]
+
+ @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+ def custom_reports_list() -> Iterable[TDataItem]:
+ """
+ The resource for custom_reports.
+
+ Returns:
+ Iterable: A generator of custom_reports.
+ """
+
+ pages = client.get_pages("company/custom-reports/reports")
+
+ for page in pages:
+ yield [item.get("attributes", {}) for item in page]
+
+ @dlt.transformer(
+ data_from=employees,
+ write_disposition="merge",
+ primary_key=["employee_id", "id"],
+ )
+ @dlt.defer
+ def employees_absences_balance(employees_item: TDataItem) -> Iterable[TDataItem]:
+ """
+ The transformer for employees_absences_balance.
+
+ Args:
+ employees_item: The employee data.
+
+ Returns:
+ Iterable: A generator of employees_absences_balance for each employee.
+ """
+ for employee in employees_item:
+ employee_id = employee["id"]
+ pages = client.get_pages(
+ f"company/employees/{employee_id}/absences/balance",
+ )
+
+ for page in pages:
+ yield [dict(employee_id=employee_id, **i) for i in page]
+
+ @dlt.transformer(
+ data_from=custom_reports_list,
+ write_disposition="merge",
+ primary_key=["report_id", "item_id"],
+ )
+ @dlt.defer
+ def custom_reports(
+ custom_reports_item: TDataItem, items_per_page: int = items_per_page
+ ) -> Iterable[TDataItem]:
+ """
+ The transformer for custom reports, supports pagination.
+
+ Args:
+ custom_reports_item: The custom_report data.
+ items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+ Returns:
+ Iterable: A generator of employees_absences_balance for each employee.
+ """
+
+ def convert_item(item: TDataItem, report_id: str) -> TDataItem:
+ """Converts an employee item."""
+ attributes = item.pop("attributes")
+ output = dict(report_id=report_id, item_id=list(item.values())[0])
+ for value in attributes:
+ name = value["attribute_id"]
+ if value["data_type"] == "date" and value["value"]:
+ output[name] = ensure_pendulum_datetime(value["value"])
+ else:
+ output[name] = value["value"]
+ return output
+
+ for custom_report in custom_reports_item:
+ report_id = custom_report["id"]
+ pages = client.get_pages(
+ f"company/custom-reports/reports/{report_id}",
+ params={"limit": items_per_page},
+ offset_by_page=True,
+ )
+
+ for page in pages:
+ for report in page:
+ report_items = report.get("attributes", {}).get("items", [])
+ yield [convert_item(item, report_id) for item in report_items]
+
+ return (
+ employees,
+ absence_types,
+ absences,
+ attendances,
+ projects,
+ document_categories,
+ employees_absences_balance,
+ custom_reports_list,
+ custom_reports,
+ )
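The file above only defines the dlt source; a minimal sketch of loading it could look like the following (the pipeline name, duckdb destination, and credential placeholders are illustrative and not part of the package):

    import dlt
    from ingestr.src.personio import personio_source

    pipeline = dlt.pipeline(
        pipeline_name="personio_demo",   # illustrative name
        destination="duckdb",            # any dlt destination would do
        dataset_name="personio_raw",
    )
    source = personio_source(
        start_date="2018-01-01",
        client_id="<client_id>",         # placeholder credentials
        client_secret="<client_secret>",
    ).with_resources("employees", "absences")
    print(pipeline.run(source))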
ingestr/src/personio/helpers.py ADDED
@@ -0,0 +1,85 @@
+ """Personio source helpers"""
+ from typing import Any, Iterable, Optional
+ from urllib.parse import urljoin
+
+ from dlt.common.typing import Dict, TDataItems
+ from dlt.sources.helpers import requests
+
+
+ class PersonioAPI:
+ """A Personio API client."""
+
+ base_url = "https://api.personio.de/v1/"
+
+ def __init__(self, client_id: str, client_secret: str) -> None:
+ """
+ Args:
+ client_id: The client ID of your app.
+ client_secret: The client secret of your app.
+ """
+ self.client_id = client_id
+ self.client_secret = client_secret
+ self.access_token = self.get_token()
+
+ def get_token(self) -> str:
+ """Get an access token from Personio.
+
+ Returns:
+ The access token.
+ """
+ headers = {"Content-Type": "application/json", "Accept": "application/json"}
+ data = {"client_id": self.client_id, "client_secret": self.client_secret}
+ url = urljoin(self.base_url, "auth")
+ response = requests.request("POST", url, headers=headers, json=data)
+ json_response = response.json()
+ token: str = json_response["data"]["token"]
+ return token
+
+ def get_pages(
+ self,
+ resource: str,
+ params: Optional[Dict[str, Any]] = None,
+ offset_by_page: bool = False,
+ ) -> Iterable[TDataItems]:
+ """Get all pages from Personio using requests.
+
+ Args:
+ resource: The resource to get pages for (e.g. employees, absences, attendances).
+ params: The parameters for the resource.
+ offset_by_page (bool): If True, offset increases by 1 per page; else, increases by page_size.
+
+ Yields:
+ List of data items from the page
+ """
+ params = params or {}
+ headers = {"Authorization": f"Bearer {self.access_token}"}
+ params.update({"offset": int(offset_by_page), "page": int(offset_by_page)})
+ url = urljoin(self.base_url, resource)
+ starts_from_zero = False
+ while True:
+ response = requests.get(url, headers=headers, params=params)
+ json_response = response.json()
+ # Get an item list from the page
+ yield json_response["data"]
+
+ metadata = json_response.get("metadata")
+ if not metadata:
+ break
+
+ total_pages = metadata.get("total_pages")
+ current_page = metadata.get("current_page")
+ if current_page == 0:
+ starts_from_zero = True
+
+ if (
+ current_page >= (total_pages - int(starts_from_zero))
+ or not json_response["data"]
+ ):
+ break
+
+ if offset_by_page:
+ params["offset"] += 1
+ params["page"] += 1
+ else:
+ params["offset"] += params["limit"]
+ params["page"] += 1
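In isolation, the pagination loop of this client can be driven like so; the credentials are placeholders and the endpoint name simply reuses one from the source above:

    from ingestr.src.personio.helpers import PersonioAPI

    api = PersonioAPI("<client_id>", "<client_secret>")  # placeholder credentials
    for page in api.get_pages("company/employees", params={"limit": 200}):
        print(len(page))  # each yielded page is a list of employee records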
ingestr/src/salesforce/__init__.py ADDED
@@ -0,0 +1,149 @@
+ from typing import Iterable
+
+ import dlt
+ from dlt.common.typing import TDataItem
+ from dlt.sources import DltResource, incremental
+ from simple_salesforce import Salesforce
+
+ from .helpers import get_records
+
+
+ @dlt.source(name="salesforce")
+ def salesforce_source(
+ username: str,
+ password: str,
+ token: str,
+ ) -> Iterable[DltResource]:
+ """
+ Retrieves data from Salesforce using the Salesforce API.
+
+ Args:
+ username (str): The username for authentication.
+ password (str): The password for authentication.
+ token (str): The security token for authentication.
+
+ Yields:
+ DltResource: Data resources from Salesforce.
+ """
+
+ client = Salesforce(username, password, token)
+
+ # define resources
+ @dlt.resource(write_disposition="replace")
+ def user() -> Iterable[TDataItem]:
+ yield get_records(client, "User")
+
+ @dlt.resource(write_disposition="replace")
+ def user_role() -> Iterable[TDataItem]:
+ yield get_records(client, "UserRole")
+
+ @dlt.resource(write_disposition="merge")
+ def opportunity(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "SystemModstamp", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(
+ client, "Opportunity", last_timestamp.last_value, "SystemModstamp"
+ )
+
+ @dlt.resource(write_disposition="merge")
+ def opportunity_line_item(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "SystemModstamp", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(
+ client, "OpportunityLineItem", last_timestamp.last_value, "SystemModstamp"
+ )
+
+ @dlt.resource(write_disposition="merge")
+ def opportunity_contact_role(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "SystemModstamp", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(
+ client,
+ "OpportunityContactRole",
+ last_timestamp.last_value,
+ "SystemModstamp",
+ )
+
+ @dlt.resource(write_disposition="merge")
+ def account(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "LastModifiedDate", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(
+ client, "Account", last_timestamp.last_value, "LastModifiedDate"
+ )
+
+ @dlt.resource(write_disposition="replace")
+ def contact() -> Iterable[TDataItem]:
+ yield get_records(client, "Contact")
+
+ @dlt.resource(write_disposition="replace")
+ def lead() -> Iterable[TDataItem]:
+ yield get_records(client, "Lead")
+
+ @dlt.resource(write_disposition="replace")
+ def campaign() -> Iterable[TDataItem]:
+ yield get_records(client, "Campaign")
+
+ @dlt.resource(write_disposition="merge")
+ def campaign_member(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "SystemModstamp", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(
+ client, "CampaignMember", last_timestamp.last_value, "SystemModstamp"
+ )
+
+ @dlt.resource(write_disposition="replace")
+ def product() -> Iterable[TDataItem]:
+ yield get_records(client, "Product2")
+
+ @dlt.resource(write_disposition="replace")
+ def pricebook() -> Iterable[TDataItem]:
+ yield get_records(client, "Pricebook2")
+
+ @dlt.resource(write_disposition="replace")
+ def pricebook_entry() -> Iterable[TDataItem]:
+ yield get_records(client, "PricebookEntry")
+
+ @dlt.resource(write_disposition="merge")
+ def task(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "SystemModstamp", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(client, "Task", last_timestamp.last_value, "SystemModstamp")
+
+ @dlt.resource(write_disposition="merge")
+ def event(
+ last_timestamp: incremental[str] = dlt.sources.incremental(
+ "SystemModstamp", initial_value=None
+ ),
+ ) -> Iterable[TDataItem]:
+ yield get_records(client, "Event", last_timestamp.last_value, "SystemModstamp")
+
+ return (
+ user,
+ user_role,
+ opportunity,
+ opportunity_line_item,
+ opportunity_contact_role,
+ account,
+ contact,
+ lead,
+ campaign,
+ campaign_member,
+ product,
+ pricebook,
+ pricebook_entry,
+ task,
+ event,
+ )
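A comparable sketch for loading the Salesforce source (the credentials are placeholders and duckdb is only an example destination; a real run needs a Salesforce username, password, and security token):

    import dlt
    from ingestr.src.salesforce import salesforce_source

    pipeline = dlt.pipeline(
        pipeline_name="salesforce_demo",
        destination="duckdb",
        dataset_name="salesforce_raw",
    )
    source = salesforce_source(
        username="<username>",          # placeholder credentials
        password="<password>",
        token="<security_token>",
    ).with_resources("opportunity", "account")
    print(pipeline.run(source))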
ingestr/src/salesforce/helpers.py ADDED
@@ -0,0 +1,64 @@
+ """Salesforce source helpers"""
+
+ from typing import Iterable, Optional
+
+ import pendulum
+ from dlt.common.typing import TDataItem
+ from simple_salesforce import Salesforce
+
+
+ def get_records(
+ sf: Salesforce,
+ sobject: str,
+ last_state: Optional[str] = None,
+ replication_key: Optional[str] = None,
+ ) -> Iterable[TDataItem]:
+ """
+ Retrieves records from Salesforce for a specified sObject.
+
+ Args:
+ sf (Salesforce): An instance of the Salesforce API client.
+ sobject (str): The name of the sObject to retrieve records from.
+ last_state (str, optional): The last known state for incremental loading. Defaults to None.
+ replication_key (str, optional): The replication key for incremental loading. Defaults to None.
+
+ Yields:
+ Dict[TDataItem]: A dictionary representing a record from the Salesforce sObject.
+ """
+
+ # Get all fields for the sobject
+ desc = getattr(sf, sobject).describe()
+ # Salesforce returns compound fields as separate fields, so we need to filter them out
+ compound_fields = {
+ f["compoundFieldName"]
+ for f in desc["fields"]
+ if f["compoundFieldName"] is not None
+ } - {"Name"}
+ # Salesforce returns datetime fields as timestamps, so we need to convert them
+ date_fields = {
+ f["name"] for f in desc["fields"] if f["type"] in ("datetime",) and f["name"]
+ }
+ # If no fields are specified, use all fields except compound fields
+ fields = [f["name"] for f in desc["fields"] if f["name"] not in compound_fields]
+
+ # Generate a predicate to filter records by the replication key
+ predicate, order_by, n_records = "", "", 0
+ if replication_key:
+ if last_state:
+ predicate = f"WHERE {replication_key} > {last_state}"
+ order_by = f"ORDER BY {replication_key} ASC"
+ query = f"SELECT {', '.join(fields)} FROM {sobject} {predicate} {order_by}"
+
+ # Query all records in batches
+ for page in getattr(sf.bulk, sobject).query_all(query, lazy_operation=True):
+ for record in page:
+ # Strip out the attributes field
+ record.pop("attributes", None)
+ for field in date_fields:
+ # Convert Salesforce timestamps to ISO 8601
+ if record.get(field):
+ record[field] = pendulum.from_timestamp(
+ record[field] / 1000,
+ ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+ yield from page
+ n_records += len(page)
ingestr/src/sources.py CHANGED
@@ -15,7 +15,7 @@ from typing import (
  Optional,
  Union,
  )
- from urllib.parse import ParseResult, parse_qs, quote, urlparse
+ from urllib.parse import ParseResult, parse_qs, quote, urlencode, urlparse

  import dlt
  import gcsfs # type: ignore
@@ -83,6 +83,8 @@ from ingestr.src.linkedin_ads.dimension_time_enum import (
  )
  from ingestr.src.mongodb import mongodb_collection
  from ingestr.src.notion import notion_databases
+ from ingestr.src.personio import personio_source
+ from ingestr.src.salesforce import salesforce_source
  from ingestr.src.shopify import shopify_source
  from ingestr.src.slack import slack_source
  from ingestr.src.sql_database.callbacks import (
@@ -134,10 +136,46 @@ class SqlSource:
  if uri.startswith("mysql://"):
  uri = uri.replace("mysql://", "mysql+pymysql://")

+ # clickhouse://<username>:<password>@<host>:<port>?secure=<secure>
  if uri.startswith("clickhouse://"):
- uri = uri.replace("clickhouse://", "clickhouse+native://")
- if "secure=" not in uri:
- uri += "?secure=1"
+ parsed_uri = urlparse(uri)
+
+ username = parsed_uri.username
+ if not username:
+ raise ValueError(
+ "A username is required to connect to the ClickHouse database."
+ )
+
+ password = parsed_uri.password
+ if not password:
+ raise ValueError(
+ "A password is required to authenticate with the ClickHouse database."
+ )
+
+ host = parsed_uri.hostname
+ if not host:
+ raise ValueError(
+ "The hostname or IP address of the ClickHouse server is required to establish a connection."
+ )
+
+ port = parsed_uri.port
+ if not port:
+ raise ValueError(
+ "The TCP port of the ClickHouse server is required to establish a connection."
+ )
+
+ query_params = parse_qs(parsed_uri.query)
+
+ if "http_port" in query_params:
+ del query_params["http_port"]
+
+ if "secure" not in query_params:
+ query_params["secure"] = ["1"]
+
+ uri = parsed_uri._replace(
+ scheme="clickhouse+native",
+ query=urlencode(query_params, doseq=True),
+ ).geturl()

  query_adapters = []
  if kwargs.get("sql_limit"):
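To illustrate what the new ClickHouse branch does, the rewrite can be reproduced in isolation; this snippet only mirrors the logic above with a made-up URI and is not an API exposed by ingestr:

    from urllib.parse import parse_qs, urlencode, urlparse

    uri = "clickhouse://user:pass@localhost:9000?http_port=8443"
    parsed = urlparse(uri)
    query = parse_qs(parsed.query)
    query.pop("http_port", None)       # http_port is stripped
    query.setdefault("secure", ["1"])  # secure=1 is added when missing
    print(parsed._replace(
        scheme="clickhouse+native",
        query=urlencode(query, doseq=True),
    ).geturl())
    # clickhouse+native://user:pass@localhost:9000?secure=1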
@@ -1753,7 +1791,7 @@ class AppLovinSource:
  def dlt_source(self, uri: str, table: str, **kwargs):
  if kwargs.get("incremental_key") is not None:
  raise ValueError(
- "Google Ads takes care of incrementality on its own, you should not provide incremental_key"
+ "Applovin takes care of incrementality on its own, you should not provide incremental_key"
  )

  parsed_uri = urlparse(uri)
@@ -1833,3 +1871,78 @@ class ApplovinMaxSource:
  api_key=api_key[0],
  application=application[0],
  ).with_resources(table)
+
+
+ class SalesforceSource:
+ def handles_incrementality(self) -> bool:
+ return True
+
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "Salesforce takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
+ params = parse_qs(urlparse(uri).query)
+ creds = {
+ "username": params.get("username", [None])[0],
+ "password": params.get("password", [None])[0],
+ "token": params.get("token", [None])[0],
+ }
+ for k, v in creds.items():
+ if v is None:
+ raise MissingValueError(k, "Salesforce")
+
+ src = salesforce_source(**creds) # type: ignore
+
+ if table not in src.resources:
+ raise UnsupportedResourceError(table, "Salesforce")
+
+ return src.with_resources(table)
+
+
+ class PersonioSource:
+ def handles_incrementality(self) -> bool:
+ return True
+
+ # applovin://?client_id=123&client_secret=123
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ parsed_uri = urlparse(uri)
+ params = parse_qs(parsed_uri.query)
+
+ client_id = params.get("client_id")
+ client_secret = params.get("client_secret")
+
+ interval_start = kwargs.get("interval_start")
+ interval_end = kwargs.get("interval_end")
+
+ interval_start_date = (
+ interval_start if interval_start is not None else "2018-01-01"
+ )
+
+ interval_end_date = (
+ interval_end.strftime("%Y-%m-%d") if interval_end is not None else None
+ )
+
+ if client_id is None:
+ raise MissingValueError("client_id", "Personio")
+ if client_secret is None:
+ raise MissingValueError("client_secret", "Personio")
+ if table not in [
+ "employees",
+ "absences",
+ "absence_types",
+ "attendances",
+ "projects",
+ "document_categories",
+ "employees_absences_balance",
+ "custom_reports_list",
+ ]:
+ raise UnsupportedResourceError(table, "Personio")
+
+ return personio_source(
+ client_id=client_id[0],
+ client_secret=client_secret[0],
+ start_date=interval_start_date,
+ end_date=interval_end_date,
+ ).with_resources(table)
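A rough sketch of exercising the two new source classes directly; the URIs and table names are illustrative, they only run against real credentials, and in normal use these values arrive via the ingestr command line rather than a script:

    from ingestr.src.sources import PersonioSource, SalesforceSource

    # Illustrative URIs with placeholder credentials.
    personio = PersonioSource().dlt_source(
        "personio://?client_id=<id>&client_secret=<secret>", "employees"
    )
    salesforce = SalesforceSource().dlt_source(
        "salesforce://?username=<user>&password=<pass>&token=<token>", "opportunity"
    )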
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ingestr
- Version: 0.13.9
+ Version: 0.13.11
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -18,42 +18,43 @@ Requires-Dist: asana==3.2.3
  Requires-Dist: clickhouse-connect==0.8.14
  Requires-Dist: clickhouse-driver==0.2.9
  Requires-Dist: clickhouse-sqlalchemy==0.2.7
- Requires-Dist: confluent-kafka>=2.6.1
+ Requires-Dist: confluent-kafka>=2.8.0
  Requires-Dist: databricks-sql-connector==2.9.3
  Requires-Dist: dataclasses-json==0.6.7
- Requires-Dist: dlt==1.5.0
- Requires-Dist: duckdb-engine==0.13.5
- Requires-Dist: duckdb==1.1.3
+ Requires-Dist: dlt==1.6.1
+ Requires-Dist: duckdb-engine==0.15.0
+ Requires-Dist: duckdb==1.2.0
  Requires-Dist: facebook-business==20.0.0
  Requires-Dist: flatten-json==0.1.14
  Requires-Dist: gcsfs==2024.10.0
  Requires-Dist: google-ads==25.1.0
- Requires-Dist: google-analytics-data==0.18.16
+ Requires-Dist: google-analytics-data==0.18.17
  Requires-Dist: google-api-python-client==2.130.0
  Requires-Dist: google-cloud-bigquery-storage==2.24.0
- Requires-Dist: mysql-connector-python==9.1.0
+ Requires-Dist: mysql-connector-python==9.2.0
  Requires-Dist: pendulum==3.0.0
  Requires-Dist: psutil==6.1.1
  Requires-Dist: psycopg2-binary==2.9.10
  Requires-Dist: py-machineid==0.6.0
  Requires-Dist: pyairtable==2.3.3
  Requires-Dist: pyarrow==18.1.0
- Requires-Dist: pyathena==3.9.0
- Requires-Dist: pymongo==4.10.1
+ Requires-Dist: pyathena==3.12.2
+ Requires-Dist: pymongo==4.11.1
  Requires-Dist: pymysql==1.1.1
  Requires-Dist: pyrate-limiter==3.7.0
  Requires-Dist: redshift-connector==2.1.5
  Requires-Dist: rich==13.9.4
  Requires-Dist: rudder-sdk-python==2.1.4
  Requires-Dist: s3fs==2024.10.0
+ Requires-Dist: simple-salesforce==1.12.6
  Requires-Dist: snowflake-sqlalchemy==1.6.1
- Requires-Dist: sqlalchemy-bigquery==1.12.0
+ Requires-Dist: sqlalchemy-bigquery==1.12.1
  Requires-Dist: sqlalchemy-hana==2.0.0
  Requires-Dist: sqlalchemy-redshift==0.8.14
  Requires-Dist: sqlalchemy2-stubs==0.0.2a38
  Requires-Dist: sqlalchemy==1.4.52
  Requires-Dist: stripe==10.7.0
- Requires-Dist: tqdm==4.67.0
+ Requires-Dist: tqdm==4.67.1
  Requires-Dist: typer==0.13.1
  Requires-Dist: types-requests==2.32.0.20240907
  Provides-Extra: odbc
@@ -161,6 +162,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
  <td>✅</td>
  <td>✅</td>
  </tr>
+ <tr>
+ <td>DynamoDB</td>
+ <td>✅</td>
+ <td>-</td>
+ </tr>
  <tr>
  <td>Local CSV file</td>
  <td>✅</td>
@@ -247,11 +253,6 @@ Pull requests are welcome. However, please open an issue first to discuss what y
  <td>✅</td>
  <td>-</td>
  </tr>
- <tr>
- <td>DynamoDB</td>
- <td>✅</td>
- <td>-</td>
- </tr>
  <tr>
  <td>Facebook Ads</td>
  <td>✅</td>
@@ -301,12 +302,22 @@ Pull requests are welcome. However, please open an issue first to discuss what y
  <td>Notion</td>
  <td>✅</td>
  <td>-</td>
+ </tr>
+ <tr>
+ <td>Personio</td>
+ <td>✅</td>
+ <td>-</td>
  </tr>
  <tr>
  <td>S3</td>
  <td>✅</td>
  <td>-</td>
  </tr>
+ <tr>
+ <td>Salesforce</td>
+ <td>✅</td>
+ <td>-</td>
+ </tr>
  <tr>
  <td>Shopify</td>
  <td>✅</td>
@@ -1,20 +1,20 @@
  ingestr/main.py,sha256=ufn8AcM2ID80ChUApJzYDjnQaurMXOkYfTm6GzAggSQ,24746
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
  ingestr/src/blob.py,sha256=LtEZWoUhm5i2aKerdgEpLtNCf3fdhGGMM4td-LRZVbY,1407
- ingestr/src/buildinfo.py,sha256=gK4juI0DAKgzAPnkZE1wP2N3AmMh6EZjH3gXGTAxWlc,20
+ ingestr/src/buildinfo.py,sha256=PnFKBMVizeXpYaYJ6rkY9m_oU0QCJzbLAOJyEQ8gyRg,21
  ingestr/src/destinations.py,sha256=vrGij4qMPCdXTMIimROWBJFqzOqCM4DFmgyubgSHejA,11279
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
- ingestr/src/factory.py,sha256=XYwjy5dfG5mLIU1v-mS17Kwl0cxSs3MG7NtgPPwZ_0U,5009
+ ingestr/src/factory.py,sha256=dOdY4fzeQ-2dgFBGIDFD5ilxpYNfCVqQOureuWzOL-w,5127
  ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
- ingestr/src/sources.py,sha256=ljh__y_ZXj8NUT0v63ZAT42K1SZsEJEB88YtQHG0IXQ,64830
+ ingestr/src/sources.py,sha256=YlWokgTZoeMQ6PVb9UVU3I99R0cdhkYjEzPf5LNGs30,68582
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
  ingestr/src/adjust/__init__.py,sha256=ULjtJqrNS6XDvUyGl0tjl12-tLyXlCgeFe2icTbtu3Q,3255
  ingestr/src/adjust/adjust_helpers.py,sha256=av97NPSn-hQtTbAC0vUSCAWYePmOiG5R-DGdMssm7FQ,3646
  ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
- ingestr/src/applovin/__init__.py,sha256=vtmYnRKnNOSzFWQIbKGbrcu6AcBdHuhPMsNruUvEIgg,7000
+ ingestr/src/applovin/__init__.py,sha256=X_YCLppPrnL8KXfYWICE_uDfMzHHH3JZ-DBGZ1RlaOI,6984
  ingestr/src/applovin_max/__init__.py,sha256=1NUOeJzRyZZQ95KEirbrlSrk-8SNc9JrlM_5pGgBgHg,2878
  ingestr/src/appsflyer/_init_.py,sha256=ne2-9FQ654Drtd3GkKQv8Bwb6LEqCnJw49MfO5Jyzgs,739
  ingestr/src/appsflyer/client.py,sha256=TNmwakLzmO6DZW3wcfLfQRl7aNBHgFqSsk4ef-MmJ1w,3084
@@ -74,6 +74,10 @@ ingestr/src/notion/settings.py,sha256=MwQVZViJtnvOegfjXYc_pJ50oUYgSRPgwqu7TvpeMO
  ingestr/src/notion/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestr/src/notion/helpers/client.py,sha256=QXuudkf5Zzff98HRsCqA1g1EZWIrnfn1falPrnKg_y4,5500
  ingestr/src/notion/helpers/database.py,sha256=gigPibTeVefP3lA-8w4aOwX67pj7RlciPk5koDs1ry8,2737
+ ingestr/src/personio/__init__.py,sha256=CQ8XX8Q8BG-wgoen3emhe_r8Cx414Fux7P8jQNawWvY,11646
+ ingestr/src/personio/helpers.py,sha256=OmeMzfg4MVtpI7f75D3-9OGZb8SDsKyz0svNm1zJLTw,2900
+ ingestr/src/salesforce/__init__.py,sha256=2hik5pRrxVODdDTlUEMoyccNC07zozjnxkMHcjMT1qA,4558
+ ingestr/src/salesforce/helpers.py,sha256=QTdazBt-qRTBbCQMZnyclIaDQFmBixBy_RDKD00Lt-8,2492
  ingestr/src/shopify/__init__.py,sha256=PF_6VQnS065Br1UzSIekTVXBu3WtrMQL_v5CfbfaX5Y,63151
  ingestr/src/shopify/exceptions.py,sha256=BhV3lIVWeBt8Eh4CWGW_REFJpGCzvW6-62yZrBWa3nQ,50
  ingestr/src/shopify/helpers.py,sha256=NfHD6lWXe88ybR0ri-FCQuh2Vf8l5WG0a0FVjmdoSC4,6296
@@ -104,8 +108,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
  ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
- ingestr-0.13.9.dist-info/METADATA,sha256=aPaAzUYc-2EPu4a0xtimG6l9InUxsWPJ1hFb6-qbUdQ,8956
- ingestr-0.13.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- ingestr-0.13.9.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
- ingestr-0.13.9.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
- ingestr-0.13.9.dist-info/RECORD,,
+ ingestr-0.13.11.dist-info/METADATA,sha256=8vjvshEDHgAZEMt3ykbUSlEl_Ky0KtHf6p6vjT6RDGI,9171
+ ingestr-0.13.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ ingestr-0.13.11.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ ingestr-0.13.11.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ ingestr-0.13.11.dist-info/RECORD,,