ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/elasticsearch/helpers.py
ADDED
@@ -0,0 +1,138 @@
+"""Elasticsearch destination helpers"""
+
+import json
+import logging
+from typing import Any, Dict, Iterator
+from urllib.parse import urlparse
+
+import dlt
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+
+# Suppress Elasticsearch transport logging
+logging.getLogger("elasticsearch.transport").setLevel(logging.WARNING)
+logging.getLogger("elastic_transport.transport").setLevel(logging.WARNING)
+
+
+def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
+    """Process items from a file path (JSONL format)."""
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                doc = json.loads(line.strip())
+                # Clean DLT metadata
+                cleaned_doc = {
+                    k: v for k, v in doc.items() if not k.startswith("_dlt_")
+                }
+                yield cleaned_doc
+
+
+def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
+    """Process items from an iterable."""
+    for item in items:
+        if isinstance(item, dict):
+            # Clean DLT metadata
+            cleaned_item = {k: v for k, v in item.items() if not k.startswith("_dlt_")}
+            yield cleaned_item
+
+
+@dlt.destination(
+    name="elasticsearch",
+    loader_file_format="typed-jsonl",
+    batch_size=1000,
+    naming_convention="snake_case",
+)
+def elasticsearch_insert(
+    items, table, connection_string: str = dlt.secrets.value
+) -> None:
+    """Insert data into Elasticsearch index.
+
+    Args:
+        items: Data items (file path or iterable)
+        table: Table metadata containing name and schema info
+        connection_string: Elasticsearch connection string
+    """
+    # Parse connection string
+    parsed = urlparse(connection_string)
+
+    # Build Elasticsearch client configuration
+    actual_url = connection_string
+    secure = True  # Default to HTTPS (secure by default)
+
+    if connection_string.startswith("elasticsearch://"):
+        actual_url = connection_string.replace("elasticsearch://", "")
+
+        # Parse to check for query parameters
+        temp_parsed = urlparse("http://" + actual_url)
+        from urllib.parse import parse_qs
+
+        query_params = parse_qs(temp_parsed.query)
+
+        # Check ?secure parameter (defaults to true)
+        if "secure" in query_params:
+            secure = query_params["secure"][0].lower() in ["true", "1", "yes"]
+
+        # Remove query params from URL for ES client
+        actual_url = actual_url.split("?")[0]
+
+        # Add scheme
+        scheme = "https" if secure else "http"
+        actual_url = f"{scheme}://{actual_url}"
+
+    parsed = urlparse(actual_url)
+
+    es_config: Dict[str, Any] = {
+        "hosts": [actual_url],
+        "verify_certs": secure,
+        "ssl_show_warn": False,
+    }
+
+    # Add authentication if present
+    if parsed.username and parsed.password:
+        es_config["http_auth"] = (parsed.username, parsed.password)
+
+    # Get index name from table metadata
+    index_name = table["name"]
+
+    # Connect to Elasticsearch
+    client = Elasticsearch(**es_config)
+
+    # Replace mode: delete existing index if it exists
+    if client.indices.exists(index=index_name):
+        client.indices.delete(index=index_name)
+
+    # Process and insert documents
+    if isinstance(items, str):
+        documents = process_file_items(items)
+    else:
+        documents = process_iterable_items(items)
+
+    # Prepare documents for bulk insert as generator
+    def doc_generator():
+        for doc in documents:
+            es_doc: Dict[str, Any] = {"_index": index_name, "_source": doc.copy()}
+
+            # Use _id if present, otherwise let ES generate one
+            if "_id" in doc:
+                es_doc["_id"] = str(doc["_id"])
+                # Remove _id from source since it's metadata
+                if "_id" in es_doc["_source"]:
+                    del es_doc["_source"]["_id"]
+            elif "id" in doc:
+                es_doc["_id"] = str(doc["id"])
+
+            yield es_doc
+
+    # Bulk insert
+    try:
+        _, failed_items = bulk(client, doc_generator(), request_timeout=60)
+        if failed_items:
+            failed_count = (
+                len(failed_items) if isinstance(failed_items, list) else failed_items
+            )
+            raise Exception(
+                f"Failed to insert {failed_count} documents: {failed_items}"
+            )
+    except Exception as e:
+        raise Exception(f"Elasticsearch bulk insert failed: {str(e)}")
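The new destination resolves the `elasticsearch://` connection URI itself: it strips the scheme, honors an optional `?secure=` query parameter (defaulting to HTTPS with certificate verification), and pulls basic-auth credentials out of the URL. Below is a minimal sketch of that resolution, mirroring the parsing above; the host and credentials are made up purely for illustration.

```python
from urllib.parse import parse_qs, urlparse

# Hypothetical ingestr-style destination URI; credentials and host are placeholders.
uri = "elasticsearch://elastic:changeme@localhost:9200?secure=false"

stripped = uri.replace("elasticsearch://", "")          # elastic:changeme@localhost:9200?secure=false
query = parse_qs(urlparse("http://" + stripped).query)  # {'secure': ['false']}
secure = query.get("secure", ["true"])[0].lower() in ["true", "1", "yes"]

host = stripped.split("?")[0]
scheme = "https" if secure else "http"
parsed = urlparse(f"{scheme}://{host}")

es_config = {
    "hosts": [f"{scheme}://{host}"],
    "verify_certs": secure,   # certificate checks follow the secure flag
    "ssl_show_warn": False,
}
if parsed.username and parsed.password:
    es_config["http_auth"] = (parsed.username, parsed.password)

print(es_config)
# {'hosts': ['http://elastic:changeme@localhost:9200'], 'verify_certs': False,
#  'ssl_show_warn': False, 'http_auth': ('elastic', 'changeme')}
```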
ingestr/src/errors.py
CHANGED
@@ -1,3 +1,6 @@
+import requests
+
+
 class MissingValueError(Exception):
     def __init__(self, value, source):
         super().__init__(f"{value} is required to connect to {source}")
@@ -16,3 +19,8 @@ class InvalidBlobTableError(Exception):
             f"Invalid source table for {source} "
             "Ensure that the table is in the format {bucket-name}/{file glob}"
         )
+
+
+class HTTPError(Exception):
+    def __init__(self, source: requests.HTTPError):
+        super().__init__(f"HTTP {source.response.status_code}: {source.response.text}")
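The new `HTTPError` wraps a `requests.HTTPError` so callers surface a compact `HTTP <status>: <body>` message. A hedged usage sketch follows; the URL is a placeholder, not an endpoint ingestr actually calls.

```python
import requests

from ingestr.src.errors import HTTPError

try:
    resp = requests.get("https://api.example.com/items")  # placeholder URL
    resp.raise_for_status()
except requests.HTTPError as exc:
    # Re-raise with a compact "HTTP <status>: <body>" message.
    raise HTTPError(exc) from exc
```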
ingestr/src/facebook_ads/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Iterator, Sequence

 import dlt
 from dlt.common import pendulum
+from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItems
 from dlt.sources import DltResource
 from facebook_business.adobjects.ad import Ad
@@ -12,7 +13,6 @@ from .helpers import (
     execute_job,
     get_ads_account,
     get_data_chunked,
-    get_start_date,
     process_report_item,
 )
 from .settings import (
@@ -22,13 +22,8 @@ from .settings import (
     DEFAULT_ADCREATIVE_FIELDS,
     DEFAULT_ADSET_FIELDS,
     DEFAULT_CAMPAIGN_FIELDS,
-    DEFAULT_INSIGHT_FIELDS,
     DEFAULT_LEAD_FIELDS,
     INSIGHT_FIELDS_TYPES,
-    INSIGHTS_BREAKDOWNS_OPTIONS,
-    INSIGHTS_PRIMARY_KEY,
-    INVALID_INSIGHTS_FIELDS,
-    TInsightsBreakdownOptions,
     TInsightsLevels,
 )

@@ -106,16 +101,20 @@ def facebook_insights_source(
     account_id: str = dlt.config.value,
     access_token: str = dlt.secrets.value,
     initial_load_past_days: int = 1,
-
-
+    dimensions: Sequence[str] = None,
+    fields: Sequence[str] = None,
     time_increment_days: int = 1,
-    breakdowns: TInsightsBreakdownOptions = "ads_insights",
     action_breakdowns: Sequence[str] = ALL_ACTION_BREAKDOWNS,
     level: TInsightsLevels = "ad",
     action_attribution_windows: Sequence[str] = ALL_ACTION_ATTRIBUTION_WINDOWS,
     batch_size: int = 50,
     request_timeout: int = 300,
     app_api_version: str = None,
+    start_date: pendulum.DateTime | None = None,
+    end_date: pendulum.DateTime | None = None,
+    insights_max_wait_to_finish_seconds: int = 60 * 60 * 4,
+    insights_max_wait_to_start_seconds: int = 60 * 30,
+    insights_max_async_sleep_seconds: int = 20,
 ) -> DltResource:
     """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.

@@ -148,40 +147,54 @@ def facebook_insights_source(
         account_id, access_token, request_timeout, app_api_version
     )

-
-
-
+    if start_date is None:
+        start_date = pendulum.today().subtract(days=initial_load_past_days)
+
+    if dimensions is None:
+        dimensions = []
+    if fields is None:
+        fields = []
+
+    columns = {}
+    for field in fields:
+        if field in INSIGHT_FIELDS_TYPES:
+            columns[field] = INSIGHT_FIELDS_TYPES[field]

     @dlt.resource(
-        primary_key=INSIGHTS_PRIMARY_KEY,
         write_disposition="merge",
-
+        merge_key="date_start",
+        columns=columns,
     )
     def facebook_insights(
         date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
             "date_start",
-            initial_value=
+            initial_value=ensure_pendulum_datetime(start_date).start_of("day").date(),
+            end_value=ensure_pendulum_datetime(end_date).end_of("day").date()
+            if end_date
+            else None,
             range_end="closed",
             range_start="closed",
         ),
     ) -> Iterator[TDataItems]:
-        start_date =
-
+        start_date = date_start.last_value
+        if date_start.end_value:
+            end_date_val = pendulum.instance(date_start.end_value)
+
+            end_date = (
+                end_date_val
+                if isinstance(end_date_val, pendulum.Date)
+                else end_date_val.date()
+            )
+        else:
+            end_date = pendulum.now().date()

-        # fetch insights in incremental day steps
         while start_date <= end_date:
             query = {
                 "level": level,
                 "action_breakdowns": list(action_breakdowns),
-                "breakdowns":
-                    INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["breakdowns"]
-                ),
+                "breakdowns": dimensions,
                 "limit": batch_size,
-                "fields":
-                    set(fields)
-                    .union(INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["fields"])
-                    .difference(INVALID_INSIGHTS_FIELDS)
-                ),
+                "fields": fields,
                 "time_increment": time_increment_days,
                 "action_attribution_windows": list(action_attribution_windows),
                 "time_ranges": [
@@ -193,8 +206,14 @@ def facebook_insights_source(
                     }
                 ],
             }
-            job = execute_job(
-
+            job = execute_job(
+                account.get_insights(params=query, is_async=True),
+                insights_max_async_sleep_seconds=insights_max_async_sleep_seconds,
+                insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds,
+                insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds,
+            )
+            output = list(map(process_report_item, job.get_result()))
+            yield output
             start_date = start_date.add(days=time_increment_days)

     return facebook_insights
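The insights source now takes explicit `dimensions`, `fields`, `start_date`/`end_date`, and job-wait limits instead of the old `breakdowns` preset argument. Below is a sketch of a call against the reworked signature; the account id, token, dimension, and metric names are illustrative only, and in a configured project they would normally come from dlt config and secrets.

```python
import dlt
from dlt.common import pendulum

from ingestr.src.facebook_ads import facebook_insights_source

# Illustrative values only; account_id/access_token normally come from dlt config/secrets.
source = facebook_insights_source(
    account_id="act_123",
    access_token="EAAB...",
    dimensions=["country"],            # passed through as "breakdowns" in the query
    fields=["impressions", "clicks"],  # passed through as "fields"
    start_date=pendulum.datetime(2024, 1, 1),
    end_date=pendulum.datetime(2024, 1, 31),
)

pipeline = dlt.pipeline(pipeline_name="facebook_insights", destination="duckdb")
pipeline.run(source)
```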
ingestr/src/facebook_ads/helpers.py
CHANGED
@@ -3,14 +3,13 @@
 import functools
 import itertools
 import time
+from datetime import datetime
 from typing import Any, Iterator, Sequence

-import dlt
 import humanize
 import pendulum
 from dlt.common import logger
 from dlt.common.configuration.inject import with_config
-from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import DictStrAny, TDataItem, TDataItems
 from dlt.sources.helpers import requests
 from dlt.sources.helpers.requests import Client
@@ -23,49 +22,21 @@ from facebook_business.api import FacebookResponse

 from .exceptions import InsightsJobTimeout
 from .settings import (
-    FACEBOOK_INSIGHTS_RETENTION_PERIOD,
     INSIGHTS_PRIMARY_KEY,
     TFbMethod,
 )


-def get_start_date(
-    incremental_start_date: dlt.sources.incremental[str],
-    attribution_window_days_lag: int = 7,
-) -> pendulum.DateTime:
-    """
-    Get the start date for incremental loading of Facebook Insights data.
-    """
-    start_date: pendulum.DateTime = ensure_pendulum_datetime(
-        incremental_start_date.start_value
-    ).subtract(days=attribution_window_days_lag)
-
-    # facebook forgets insights so trim the lag and warn
-    min_start_date = pendulum.today().subtract(
-        months=FACEBOOK_INSIGHTS_RETENTION_PERIOD
-    )
-    if start_date < min_start_date:
-        logger.warning(
-            "%s: Start date is earlier than %s months ago, using %s instead. "
-            "For more information, see https://www.facebook.com/business/help/1695754927158071?id=354406972049255",
-            "facebook_insights",
-            FACEBOOK_INSIGHTS_RETENTION_PERIOD,
-            min_start_date,
-        )
-        start_date = min_start_date
-        incremental_start_date.start_value = min_start_date
-
-    # lag the incremental start date by attribution window lag
-    incremental_start_date.start_value = start_date.isoformat()
-    return start_date
-
-
 def process_report_item(item: AbstractObject) -> DictStrAny:
+    if "date_start" in item:
+        item["date_start"] = datetime.strptime(item["date_start"], "%Y-%m-%d").date()
+    if "date_stop" in item:
+        item["date_stop"] = datetime.strptime(item["date_stop"], "%Y-%m-%d").date()
+
     d: DictStrAny = item.export_all_data()
     for pki in INSIGHTS_PRIMARY_KEY:
         if pki not in d:
             d[pki] = "no_" + pki
-
     return d


@@ -138,17 +109,22 @@ def execute_job(
 ) -> AbstractCrudObject:
     status: str = None
     time_start = time.time()
-    sleep_time =
+    sleep_time = 3
     while status != "Job Completed":
+        print("-----")
+        print("waiting for job to finish")
         duration = time.time() - time_start
         job = job.api_get()
         status = job["async_status"]
         percent_complete = job["async_percent_completion"]
+        print("async_status", status)
+        print("percent_complete", percent_complete)

         job_id = job["id"]
         logger.info("%s, %d%% done", status, percent_complete)

         if status == "Job Completed":
+            print("job completed")
             return job

         if duration > insights_max_wait_to_start_seconds and percent_complete == 0:
@@ -168,7 +144,7 @@ def execute_job(
             raise InsightsJobTimeout(
                 "facebook_insights",
                 pretty_error_message.format(
-                    job_id, insights_max_wait_to_finish_seconds
+                    job_id, insights_max_wait_to_finish_seconds
                 ),
             )

@@ -253,3 +229,49 @@ def notify_on_token_expiration(access_token_expires_at: int = None) -> None:
         logger.error(
             f"Access Token expires in {humanize.precisedelta(pendulum.now() - expires_at)}. Replace the token now!"
         )
+
+
+def parse_insights_table_to_source_kwargs(table: str) -> DictStrAny:
+    import typing
+
+    from ingestr.src.facebook_ads.settings import (
+        INSIGHTS_BREAKDOWNS_OPTIONS,
+        TInsightsBreakdownOptions,
+        TInsightsLevels,
+    )
+
+    parts = table.split(":")
+
+    source_kwargs = {}
+
+    breakdown_type = parts[1]
+
+    valid_breakdowns = list(typing.get_args(TInsightsBreakdownOptions))
+    if breakdown_type in valid_breakdowns:
+        dimensions = INSIGHTS_BREAKDOWNS_OPTIONS[breakdown_type]["breakdowns"]
+        fields = INSIGHTS_BREAKDOWNS_OPTIONS[breakdown_type]["fields"]
+        source_kwargs["dimensions"] = dimensions
+        source_kwargs["fields"] = fields
+    else:
+        dimensions = breakdown_type.split(",")
+        valid_levels = list(typing.get_args(TInsightsLevels))
+        level = None
+        for valid_level in reversed(valid_levels):
+            if valid_level in dimensions:
+                level = valid_level
+                dimensions.remove(valid_level)
+                break
+
+        source_kwargs["level"] = level
+        source_kwargs["dimensions"] = dimensions
+
+    # If custom metrics are provided, parse them
+    if len(parts) == 3:
+        fields = [f.strip() for f in parts[2].split(",") if f.strip()]
+        if not fields:
+            raise ValueError(
+                "Custom metrics must be provided after the second colon in format: facebook_insights:breakdown_type:metric1,metric2..."
+            )
+        source_kwargs["fields"] = fields
+
+    return source_kwargs
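The new `parse_insights_table_to_source_kwargs` turns a colon-separated table spec into source arguments: a named breakdown preset maps straight to that preset's dimensions and fields, while a custom spec is split into a level, the remaining dimensions, and optional metrics after a second colon. A hedged example of the custom form follows; the dimension and metric names are illustrative, and the expected result assumes "ad" is one of the valid insights levels (it is the default level in the source above).

```python
from ingestr.src.facebook_ads.helpers import parse_insights_table_to_source_kwargs

# Custom spec: "<table>:<dimensions>:<metrics>", dimensions and metrics comma-separated.
kwargs = parse_insights_table_to_source_kwargs(
    "facebook_insights:country,ad:impressions,clicks"
)
# Expected shape (assuming "ad" is a valid TInsightsLevels value):
# {"level": "ad", "dimensions": ["country"], "fields": ["impressions", "clicks"]}
```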
ingestr/src/facebook_ads/utils.py
ADDED
@@ -0,0 +1,39 @@
+from typing import Dict
+
+import dlt
+from dlt.common.configuration.inject import with_config
+from dlt.sources.helpers import requests
+
+
+@with_config(sections=("sources", "facebook_ads"))
+def debug_access_token(
+    access_token: str = dlt.secrets.value,
+    client_id: str = dlt.secrets.value,
+    client_secret: str = dlt.secrets.value,
+) -> str:
+    """Debugs the `access_token` providing info on expiration time, scopes etc. If arguments are not provides, `dlt` will inject them from configuration"""
+    debug_url = f"https://graph.facebook.com/debug_token?input_token={access_token}&access_token={client_id}|{client_secret}"
+    response = requests.get(debug_url)
+    data: Dict[str, str] = response.json()
+
+    if "error" in data:
+        raise Exception(f"Error debugging token: {data['error']}")
+
+    return data["data"]
+
+
+@with_config(sections=("sources", "facebook_ads"))
+def get_long_lived_token(
+    access_token: str = dlt.secrets.value,
+    client_id: str = dlt.secrets.value,
+    client_secret: str = dlt.secrets.value,
+) -> str:
+    """Gets the long lived access token (60 days) from `access_token`. If arguments are not provides, `dlt` will inject them from configuration"""
+    exchange_url = f"https://graph.facebook.com/v13.0/oauth/access_token?grant_type=fb_exchange_token&client_id={client_id}&client_secret={client_secret}&fb_exchange_token={access_token}"
+    response = requests.get(exchange_url)
+    data: Dict[str, str] = response.json()
+
+    if "error" in data:
+        raise Exception(f"Error refreshing token: {data['error']}")
+
+    return data["access_token"]
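These helpers exchange and inspect Facebook access tokens via the Graph API. A short usage sketch with placeholder credentials; under a configured dlt project the arguments can be omitted and injected from secrets instead.

```python
from ingestr.src.facebook_ads.utils import get_long_lived_token

# Placeholder values; real ones come from the Facebook app and a short-lived user token.
long_lived = get_long_lived_token(
    access_token="short-lived-user-token",
    client_id="app-id",
    client_secret="app-secret",
)
print(long_lived)  # ~60-day token returned by the Graph API exchange endpoint
```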