ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from typing import Any, Dict, Iterator, Optional
|
|
2
|
+
|
|
3
|
+
import dlt
|
|
4
|
+
import pendulum
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
LINEAR_GRAPHQL_ENDPOINT = "https://api.linear.app/graphql"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _graphql(
    api_key: str, query: str, variables: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Execute a GraphQL request against the Linear API.

    Returns the `data` portion of the response. Raises ValueError when the
    server reports GraphQL-level errors, and requests.HTTPError for
    transport-level failures.
    """
    body = {"query": query, "variables": variables or {}}
    response = requests.post(
        LINEAR_GRAPHQL_ENDPOINT,
        json=body,
        headers={"Authorization": api_key, "Content-Type": "application/json"},
    )
    response.raise_for_status()
    payload = response.json()
    if "errors" in payload:
        raise ValueError(str(payload["errors"]))
    return payload["data"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _paginate(api_key: str, query: str, root: str) -> Iterator[Dict[str, Any]]:
    """Yield every node of a cursor-paginated Linear GraphQL connection.

    `root` names the connection field in the response; pagination follows
    `pageInfo.endCursor` until `pageInfo.hasNextPage` is false.
    """
    cursor: Optional[str] = None
    has_more = True
    while has_more:
        page = _graphql(api_key, query, {"cursor": cursor})[root]
        yield from page["nodes"]
        page_info = page["pageInfo"]
        has_more = page_info["hasNextPage"]
        cursor = page_info["endCursor"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_date_range(updated_at, start_date):
    """Resolve the effective (start, end) window from incremental state.

    Falls back to `start_date` when no cursor has been saved yet, and to the
    current UTC time when no explicit end bound is configured.
    """
    # `or` mirrors the original truthiness check on last_value.
    start = pendulum.parse(updated_at.last_value or start_date)
    end = (
        pendulum.parse(updated_at.end_value)
        if updated_at.end_value
        else pendulum.now(tz="UTC")
    )
    return start, end
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _paginated_resource(
    api_key: str, query: str, query_field: str, updated_at, start_date
) -> Iterator[Dict[str, Any]]:
    """Yield normalized items from a paginated query, restricted to the
    incremental date window.

    Args:
        api_key: Linear API key.
        query: GraphQL query accepting a `cursor` variable.
        query_field: Root connection field of the response.
        updated_at: dlt incremental state (supplies last_value/end_value).
        start_date: Fallback lower bound when no incremental state exists.
    """
    current_start_date, current_end_date = _get_date_range(updated_at, start_date)

    for item in _paginate(api_key, query, query_field):
        # Parse once per item; the original parsed the same timestamp twice.
        item_updated_at = pendulum.parse(item["updatedAt"])
        if current_start_date <= item_updated_at <= current_end_date:
            yield normalize_dictionaries(item)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _create_paginated_resource(
    resource_name: str,
    query: str,
    query_field: str,
    api_key: str,
    start_date,
    end_date=None,
):
    """Factory: build a dlt merge resource over a paginated Linear query.

    Args:
        resource_name: Name of the emitted dlt resource/table.
        query: GraphQL query accepting a `cursor` variable.
        query_field: Root connection field of the GraphQL response.
        api_key: Linear API key.
        start_date: Initial lower bound for the `updatedAt` cursor.
        end_date: Optional fixed upper bound; when None the window is open.

    Returns:
        A dlt resource function with merge disposition keyed on `id`.
    """

    @dlt.resource(name=resource_name, primary_key="id", write_disposition="merge")
    def paginated_resource(
        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
            "updatedAt",
            initial_value=start_date.isoformat(),
            end_value=end_date.isoformat() if end_date else None,
            range_start="closed",
            range_end="closed",
        ),
    ) -> Iterator[Dict[str, Any]]:
        # _paginated_resource already normalizes every item it yields;
        # normalizing a second time here (as the original did) was redundant.
        yield from _paginated_resource(
            api_key, query, query_field, updated_at, start_date
        )

    return paginated_resource
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def normalize_dictionaries(item: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten well-known nested dict shapes in a Linear API item.

    A nested object carrying an ``id`` is replaced by a scalar ``{field}_id``
    key (appended after the existing keys); an object wrapping a ``nodes``
    field is replaced by that list in place. All other values pass through
    untouched. The input dict is not mutated.
    """
    result = dict(item)

    for key in list(result):
        value = result[key]
        if not isinstance(value, dict):
            continue
        if "id" in value:
            # Collapse {key: {"id": ...}} into a trailing {key}_id column.
            del result[key]
            result[f"{key}_id"] = value["id"]
        elif "nodes" in value:
            # Unwrap GraphQL connection objects to their node list.
            result[key] = value["nodes"]

    return result
|
ingestr/src/loader.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import gzip
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from typing import Generator
|
|
7
|
+
|
|
8
|
+
from pyarrow.parquet import ParquetFile # type: ignore
|
|
9
|
+
|
|
10
|
+
PARQUET_BATCH_SIZE = 64
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class UnsupportedLoaderFileFormat(Exception):
    """Raised when a dlt loader file's detected type is none of the supported
    formats (gzip jsonl, CSV, Apache Parquet)."""

    pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_dlt_file(filepath: str) -> Generator:
    """Yield data items (as dicts) from a dlt loader file.

    The on-disk format is detected automatically by shelling out to the
    ``file`` utility, so ``file(1)`` must be available on PATH.
    """
    probe = subprocess.run(
        ["file", "-b", filepath],
        check=True,
        capture_output=True,
        text=True,
    )

    detected = probe.stdout.strip()
    with factory(detected, filepath) as rows:
        yield from rows
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def factory(filetype: str, filepath: str):
    """Return a context-managed row reader matching `filetype`.

    `filetype` is the human-readable description produced by ``file -b``.
    Raises UnsupportedLoaderFileFormat for anything unrecognized.
    """
    # ???(turtledev): can dlt produce non-gzipped jsonl files?
    openers = (
        ("gzip", jsonlfile),
        ("CSV", csvfile),
        ("Apache Parquet", parquetfile),
    )
    for prefix, opener in openers:
        if filetype.startswith(prefix):
            return opener(filepath)
    raise UnsupportedLoaderFileFormat(filetype)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@contextmanager
def jsonlfile(filepath: str):
    """Context manager yielding an iterator of dicts from a gzipped JSONL file."""

    def _rows(stream):
        for raw in stream:
            yield json.loads(raw.decode().strip())

    with gzip.open(filepath) as stream:
        yield _rows(stream)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@contextmanager
def csvfile(filepath: str):
    """Context manager yielding a csv.DictReader over `filepath`."""
    with open(filepath, "r") as handle:
        yield csv.DictReader(handle)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@contextmanager
def parquetfile(filepath: str):
    """Context manager yielding an iterator of row dicts from a Parquet file."""

    def _rows(parquet: ParquetFile):
        for record_batch in parquet.iter_batches(PARQUET_BATCH_SIZE):
            yield from record_batch.to_pylist()

    with open(filepath, "rb") as handle:
        yield _rows(ParquetFile(handle))
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mailchimp source for data extraction via REST API.
|
|
3
|
+
|
|
4
|
+
This source provides access to Mailchimp account data.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any, Iterable, Iterator
|
|
8
|
+
|
|
9
|
+
import dlt
|
|
10
|
+
from dlt.sources import DltResource
|
|
11
|
+
|
|
12
|
+
from ingestr.src.http_client import create_client
|
|
13
|
+
from ingestr.src.mailchimp.helpers import (
|
|
14
|
+
create_merge_resource,
|
|
15
|
+
create_nested_resource,
|
|
16
|
+
create_replace_resource,
|
|
17
|
+
)
|
|
18
|
+
from ingestr.src.mailchimp.settings import (
|
|
19
|
+
MERGE_ENDPOINTS,
|
|
20
|
+
NESTED_ENDPOINTS,
|
|
21
|
+
REPLACE_ENDPOINTS,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dlt.source(max_table_nesting=0, name="mailchimp_source")
def mailchimp_source(
    api_key: str,
    server: str,
) -> Iterable[DltResource]:
    """Mailchimp data source.

    Args:
        api_key: Mailchimp API key for authentication.
        server: Server prefix of the account's API host (e.g. 'us10').

    Yields:
        DltResource: one resource per configured Mailchimp endpoint.
    """
    base_url = f"https://{server}.api.mailchimp.com/3.0"
    session = create_client()
    # Mailchimp basic auth ignores the username; only the key matters.
    auth = ("anystring", api_key)

    @dlt.resource(
        name="account",
        write_disposition="replace",
    )
    def fetch_account() -> Iterator[dict[str, Any]]:
        """Fetch the single account-information object (full refresh)."""
        response = session.get(f"{base_url}/", auth=auth)
        response.raise_for_status()
        yield response.json()

    resources: list = [fetch_account]

    # Incrementally-loaded endpoints (merge by primary key).
    for name, path, data_key, pk, ik in MERGE_ENDPOINTS:
        resources.append(
            create_merge_resource(
                base_url, session, auth, name, path, data_key, pk, ik
            )
        )

    # Full-refresh endpoints (replace disposition, no incremental cursor).
    for name, path, data_key, pk in REPLACE_ENDPOINTS:
        resources.append(
            create_replace_resource(base_url, session, auth, name, path, data_key, pk)
        )

    # Child endpoints that require iterating a parent collection first.
    for (
        parent_name,
        parent_path,
        parent_key,
        parent_id_field,
        nested_name,
        nested_path,
        nested_key,
        pk,
    ) in NESTED_ENDPOINTS:
        resources.append(
            create_nested_resource(
                base_url,
                session,
                auth,
                parent_name,
                parent_path,
                parent_key,
                parent_id_field,
                nested_name,
                nested_path,
                nested_key,
                pk,
            )
        )

    return tuple(resources)
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Helper functions for Mailchimp source.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Iterator
|
|
6
|
+
|
|
7
|
+
import dlt
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def fetch_paginated(
|
|
11
|
+
session,
|
|
12
|
+
url: str,
|
|
13
|
+
auth: tuple,
|
|
14
|
+
data_key: str | None = None,
|
|
15
|
+
) -> Iterator[dict[str, Any]]:
|
|
16
|
+
"""
|
|
17
|
+
Helper function to fetch paginated data from Mailchimp API.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
session: HTTP session
|
|
21
|
+
url: API endpoint URL
|
|
22
|
+
auth: Authentication tuple
|
|
23
|
+
data_key: Key in response containing the data array (if None, return whole response)
|
|
24
|
+
|
|
25
|
+
Yields:
|
|
26
|
+
Individual items from the paginated response
|
|
27
|
+
"""
|
|
28
|
+
offset = 0
|
|
29
|
+
count = 1000 # Maximum allowed by Mailchimp
|
|
30
|
+
|
|
31
|
+
while True:
|
|
32
|
+
params = {"count": count, "offset": offset}
|
|
33
|
+
response = session.get(url, auth=auth, params=params)
|
|
34
|
+
response.raise_for_status()
|
|
35
|
+
data = response.json()
|
|
36
|
+
|
|
37
|
+
# Extract items from response
|
|
38
|
+
if data_key and data_key in data:
|
|
39
|
+
items = data[data_key]
|
|
40
|
+
elif isinstance(data, list):
|
|
41
|
+
items = data
|
|
42
|
+
else:
|
|
43
|
+
# If no data_key specified and response is dict, yield the whole response
|
|
44
|
+
yield data
|
|
45
|
+
break
|
|
46
|
+
|
|
47
|
+
if not items:
|
|
48
|
+
break
|
|
49
|
+
|
|
50
|
+
yield from items
|
|
51
|
+
|
|
52
|
+
# Check if we've received fewer items than requested (last page)
|
|
53
|
+
if len(items) < count:
|
|
54
|
+
break
|
|
55
|
+
|
|
56
|
+
offset += count
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def create_merge_resource(
    base_url: str,
    session,
    auth: tuple,
    name: str,
    path: str,
    key: str,
    pk: str,
    ik: str,
):
    """
    Create a DLT resource with merge disposition for incremental loading.

    Args:
        base_url: Base API URL
        session: HTTP session used for all requests
        auth: Authentication tuple (username, api_key)
        name: Resource name
        path: API endpoint path
        key: Data key in response
        pk: Primary key field used for merge deduplication
        ik: Incremental key (cursor) field

    Returns:
        DLT resource function
    """

    @dlt.resource(
        name=name,
        write_disposition="merge",
        primary_key=pk,
    )
    def fetch_data(
        # NOTE(review): `updated_at` is never referenced in the body, so the
        # full collection is fetched on every run; presumably dlt uses this
        # incremental argument to filter the yielded rows on `ik` — confirm
        # against dlt's incremental-loading semantics.
        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
            ik, initial_value=None
        ),
    ) -> Iterator[dict[str, Any]]:
        url = f"{base_url}/{path}"
        yield from fetch_paginated(session, url, auth, data_key=key)

    return fetch_data
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def create_replace_resource(
    base_url: str,
    session,
    auth: tuple,
    name: str,
    path: str,
    key: str,
    pk: str | None,
):
    """Build a full-refresh (replace disposition) DLT resource.

    Args:
        base_url: Base API URL.
        session: HTTP session used for the requests.
        auth: Basic-auth tuple.
        name: Resource name.
        path: API endpoint path.
        key: Data key in the response.
        pk: Primary key field; when None the resource is created without one.

    Returns:
        DLT resource function.
    """

    def fetch_data() -> Iterator[dict[str, Any]]:
        yield from fetch_paginated(session, f"{base_url}/{path}", auth, data_key=key)

    # Only pass primary_key through when one was configured.
    resource_kwargs: dict[str, Any] = {"name": name, "write_disposition": "replace"}
    if pk is not None:
        resource_kwargs["primary_key"] = pk
    return dlt.resource(fetch_data, **resource_kwargs)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def create_nested_resource(
    base_url: str,
    session,
    auth: tuple,
    parent_resource_name: str,
    parent_path: str,
    parent_key: str,
    parent_id_field: str,
    nested_name: str,
    nested_path: str,
    nested_key: str | None,
    pk: str | None,
):
    """Build a full-refresh DLT resource for a child endpoint.

    The parent collection is paginated first; for each parent record, the id
    read from `parent_id_field` is substituted into `nested_path` and the
    child endpoint is fetched. Every emitted row (or whole payload, when
    `nested_key` is None or missing) is tagged with a
    ``{parent_resource_name}_id`` back-reference.

    Args:
        base_url: Base API URL.
        session: HTTP session used for the requests.
        auth: Basic-auth tuple.
        parent_resource_name: Name of the parent resource.
        parent_path: Parent API endpoint path.
        parent_key: Data key in the parent response.
        parent_id_field: Field holding the parent record's id.
        nested_name: Nested resource name.
        nested_path: Child endpoint path containing an ``{id}`` placeholder.
        nested_key: Data key in the child response, or None for the whole body.
        pk: Primary key field; when None the resource is created without one.

    Returns:
        DLT resource function.
    """

    def fetch_nested_data() -> Iterator[dict[str, Any]]:
        parent_url = f"{base_url}/{parent_path}"
        for parent in fetch_paginated(session, parent_url, auth, data_key=parent_key):
            parent_id = parent.get(parent_id_field)
            if not parent_id:
                # Parent records without an id cannot be expanded.
                continue

            response = session.get(
                f"{base_url}/{nested_path.format(id=parent_id)}", auth=auth
            )
            response.raise_for_status()
            payload = response.json()

            if nested_key and nested_key in payload:
                extracted = payload[nested_key]
                if isinstance(extracted, list):
                    for row in extracted:
                        row[f"{parent_resource_name}_id"] = parent_id
                        yield row
                else:
                    extracted[f"{parent_resource_name}_id"] = parent_id
                    yield extracted
            else:
                # No extractable key: forward the whole payload.
                payload[f"{parent_resource_name}_id"] = parent_id
                yield payload

    # Only pass primary_key through when one was configured.
    resource_kwargs: dict[str, Any] = {
        "name": nested_name,
        "write_disposition": "replace",
    }
    if pk is not None:
        resource_kwargs["primary_key"] = pk
    return dlt.resource(fetch_nested_data, **resource_kwargs)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mailchimp API endpoint configurations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Endpoints with merge disposition (have both primary_key and incremental_key)
|
|
6
|
+
# Format: (resource_name, endpoint_path, data_key, primary_key, incremental_key)
|
|
7
|
+
# Each entry maps one top-level Mailchimp collection to an incrementally
# loaded (merge-disposition) table.
MERGE_ENDPOINTS = [
    ("audiences", "lists", "lists", "id", "date_created"),
    ("automations", "automations", "automations", "id", "create_time"),
    ("campaigns", "campaigns", "campaigns", "id", "create_time"),
    ("connected_sites", "connected-sites", "sites", "id", "updated_at"),
    # NOTE(review): dotted cursor path into a nested object — confirm the
    # incremental consumer supports nested incremental keys.
    ("conversations", "conversations", "conversations", "id", "last_message.timestamp"),
    ("ecommerce_stores", "ecommerce/stores", "stores", "id", "updated_at"),
    ("facebook_ads", "facebook-ads", "facebook_ads", "id", "updated_at"),
    ("landing_pages", "landing-pages", "landing_pages", "id", "updated_at"),
    ("reports", "reports", "reports", "id", "send_time"),
]
|
|
18
|
+
|
|
19
|
+
# Endpoints with replace disposition
|
|
20
|
+
# Format: (resource_name, endpoint_path, data_key, primary_key)
|
|
21
|
+
REPLACE_ENDPOINTS: list[tuple[str, str, str, str | None]] = [
    # A None primary key means the resource is created without one
    # (see create_replace_resource).
    ("account_exports", "account-exports", "exports", None),
    ("authorized_apps", "authorized-apps", "apps", "id"),
    ("batches", "batches", "batches", None),
    ("campaign_folders", "campaign-folders", "folders", "id"),
    ("chimp_chatter", "activity-feed/chimp-chatter", "chimp_chatter", None),
]
|
|
28
|
+
|
|
29
|
+
# Nested endpoints (depend on parent resources)
|
|
30
|
+
# Format: (parent_name, parent_path, parent_key, parent_id_field, nested_name, nested_path, nested_key, pk)
|
|
31
|
+
NESTED_ENDPOINTS: list[tuple[str, str, str, str, str, str, str | None, str | None]] = [
|
|
32
|
+
# Reports nested endpoints
|
|
33
|
+
(
|
|
34
|
+
"reports",
|
|
35
|
+
"reports",
|
|
36
|
+
"reports",
|
|
37
|
+
"id",
|
|
38
|
+
"reports_advice",
|
|
39
|
+
"reports/{id}/advice",
|
|
40
|
+
None,
|
|
41
|
+
None,
|
|
42
|
+
),
|
|
43
|
+
(
|
|
44
|
+
"reports",
|
|
45
|
+
"reports",
|
|
46
|
+
"reports",
|
|
47
|
+
"id",
|
|
48
|
+
"reports_domain_performance",
|
|
49
|
+
"reports/{id}/domain-performance",
|
|
50
|
+
"domains",
|
|
51
|
+
None,
|
|
52
|
+
),
|
|
53
|
+
(
|
|
54
|
+
"reports",
|
|
55
|
+
"reports",
|
|
56
|
+
"reports",
|
|
57
|
+
"id",
|
|
58
|
+
"reports_locations",
|
|
59
|
+
"reports/{id}/locations",
|
|
60
|
+
"locations",
|
|
61
|
+
None,
|
|
62
|
+
),
|
|
63
|
+
(
|
|
64
|
+
"reports",
|
|
65
|
+
"reports",
|
|
66
|
+
"reports",
|
|
67
|
+
"id",
|
|
68
|
+
"reports_sent_to",
|
|
69
|
+
"reports/{id}/sent-to",
|
|
70
|
+
"sent_to",
|
|
71
|
+
None,
|
|
72
|
+
),
|
|
73
|
+
(
|
|
74
|
+
"reports",
|
|
75
|
+
"reports",
|
|
76
|
+
"reports",
|
|
77
|
+
"id",
|
|
78
|
+
"reports_sub_reports",
|
|
79
|
+
"reports/{id}/sub-reports",
|
|
80
|
+
None,
|
|
81
|
+
None,
|
|
82
|
+
),
|
|
83
|
+
(
|
|
84
|
+
"reports",
|
|
85
|
+
"reports",
|
|
86
|
+
"reports",
|
|
87
|
+
"id",
|
|
88
|
+
"reports_unsubscribed",
|
|
89
|
+
"reports/{id}/unsubscribed",
|
|
90
|
+
"unsubscribes",
|
|
91
|
+
None,
|
|
92
|
+
),
|
|
93
|
+
# Lists/Audiences nested endpoints
|
|
94
|
+
(
|
|
95
|
+
"audiences",
|
|
96
|
+
"lists",
|
|
97
|
+
"lists",
|
|
98
|
+
"id",
|
|
99
|
+
"lists_activity",
|
|
100
|
+
"lists/{id}/activity",
|
|
101
|
+
"activity",
|
|
102
|
+
None,
|
|
103
|
+
),
|
|
104
|
+
(
|
|
105
|
+
"audiences",
|
|
106
|
+
"lists",
|
|
107
|
+
"lists",
|
|
108
|
+
"id",
|
|
109
|
+
"lists_clients",
|
|
110
|
+
"lists/{id}/clients",
|
|
111
|
+
"clients",
|
|
112
|
+
None,
|
|
113
|
+
),
|
|
114
|
+
(
|
|
115
|
+
"audiences",
|
|
116
|
+
"lists",
|
|
117
|
+
"lists",
|
|
118
|
+
"id",
|
|
119
|
+
"lists_growth_history",
|
|
120
|
+
"lists/{id}/growth-history",
|
|
121
|
+
"history",
|
|
122
|
+
None,
|
|
123
|
+
),
|
|
124
|
+
(
|
|
125
|
+
"audiences",
|
|
126
|
+
"lists",
|
|
127
|
+
"lists",
|
|
128
|
+
"id",
|
|
129
|
+
"lists_interest_categories",
|
|
130
|
+
"lists/{id}/interest-categories",
|
|
131
|
+
"categories",
|
|
132
|
+
None,
|
|
133
|
+
),
|
|
134
|
+
(
|
|
135
|
+
"audiences",
|
|
136
|
+
"lists",
|
|
137
|
+
"lists",
|
|
138
|
+
"id",
|
|
139
|
+
"lists_locations",
|
|
140
|
+
"lists/{id}/locations",
|
|
141
|
+
"locations",
|
|
142
|
+
None,
|
|
143
|
+
),
|
|
144
|
+
(
|
|
145
|
+
"audiences",
|
|
146
|
+
"lists",
|
|
147
|
+
"lists",
|
|
148
|
+
"id",
|
|
149
|
+
"lists_merge_fields",
|
|
150
|
+
"lists/{id}/merge-fields",
|
|
151
|
+
"merge_fields",
|
|
152
|
+
None,
|
|
153
|
+
),
|
|
154
|
+
(
|
|
155
|
+
"audiences",
|
|
156
|
+
"lists",
|
|
157
|
+
"lists",
|
|
158
|
+
"id",
|
|
159
|
+
"lists_segments",
|
|
160
|
+
"lists/{id}/segments",
|
|
161
|
+
"segments",
|
|
162
|
+
None,
|
|
163
|
+
),
|
|
164
|
+
]
|