ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/masking.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import hmac
|
|
3
|
+
import random
|
|
4
|
+
import re
|
|
5
|
+
import string
|
|
6
|
+
import uuid
|
|
7
|
+
from datetime import date, datetime, timedelta
|
|
8
|
+
from typing import Any, Callable, Dict, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MaskingEngine:
    """Factory for per-value masking callables used to obscure column data.

    A mask is configured with a ``column:algorithm[:param]`` string (see
    ``parse_mask_config``) and resolved into a one-argument callable by
    ``get_masking_function``.  The tokenising algorithms keep state on the
    instance so that a repeated input value maps to the same token.
    """

    def __init__(self):
        # str(value) -> UUID token handed out by ``_tokenize_uuid``.
        self.token_cache: Dict[str, Union[str, int]] = {}
        # str(value) -> integer token handed out by ``_tokenize_sequential``.
        # Kept apart from ``token_cache``: with a shared dict, a value seen by
        # both tokenisers made ``int(<uuid string>)`` raise, or returned an
        # integer rendered as a string in place of a UUID.
        self._sequential_tokens: Dict[str, int] = {}
        # Last integer issued by ``_tokenize_sequential``.
        self.sequential_counter = 0

    def parse_mask_config(self, config: str) -> Tuple[str, str, Optional[str]]:
        """Split a ``column:algorithm[:param]`` string into its parts.

        Returns:
            ``(column, algorithm, param)`` where ``param`` is ``None`` when
            the configuration has only two fields.

        Raises:
            ValueError: if ``config`` does not have exactly two or three
                colon-separated fields.
        """
        parts = config.split(":")
        if len(parts) == 2:
            return parts[0], parts[1], None
        elif len(parts) == 3:
            return parts[0], parts[1], parts[2]
        else:
            raise ValueError(
                f"Invalid mask configuration: {config}. Expected format: 'column:algorithm[:param]'"
            )

    def get_masking_function(
        self, algorithm: str, param: Optional[str] = None
    ) -> Callable:
        """Return the one-argument masking callable for ``algorithm``.

        Args:
            algorithm: Algorithm name, matched case-insensitively.
            param: Optional algorithm parameter as text; it is converted to
                ``int``/``float`` for the algorithms that need a number and
                ignored by the parameterless ones.

        Raises:
            ValueError: if the algorithm is unknown, if a numeric ``param``
                cannot be parsed, or if ``round``/``range`` are given a step
                of zero (which could only fail later, once per value).
        """
        algorithm = algorithm.lower()

        # Algorithms that take no parameter map straight to a bound method.
        parameterless: Dict[str, Callable] = {
            # Hash-based masking
            "hash": self._hash_sha256,
            "sha256": self._hash_sha256,
            "md5": self._hash_md5,
            # Format-preserving masking
            "email": self._mask_email,
            "phone": self._mask_phone,
            "credit_card": self._mask_credit_card,
            "ssn": self._mask_ssn,
            # Random replacement / partial masking
            "random": self._random_replace,
            "first_letter": self._first_letter_mask,
            # Tokenization
            "uuid": self._tokenize_uuid,
            "sequential": self._tokenize_sequential,
            # Date masking
            "year_only": self._year_only,
            "month_year": self._month_year,
        }
        if algorithm in parameterless:
            return parameterless[algorithm]

        if algorithm == "hmac":
            # NOTE(review): without a param every caller shares the literal
            # key "default-key", which makes the HMAC no stronger than a
            # plain hash. Supply a secret key in the configuration.
            key = param or "default-key"
            return lambda x: self._hash_hmac(x, key)

        # Redaction strategies
        if algorithm == "redact":
            return lambda x: "REDACTED"
        if algorithm == "stars":
            return lambda x: "*" * len(str(x)) if x else ""
        if algorithm == "fixed":
            replacement = param or "MASKED"
            return lambda x: replacement

        # Partial masking
        if algorithm == "partial":
            chars = int(param) if param else 2
            return lambda x: self._partial_mask(x, chars)

        # Numeric masking
        if algorithm == "round":
            precision = int(param) if param else 10
            if precision == 0:
                raise ValueError("Masking algorithm 'round' requires a non-zero precision")
            return lambda x: self._round_number(x, precision)
        if algorithm == "range":
            bucket_size = int(param) if param else 100
            if bucket_size == 0:
                raise ValueError("Masking algorithm 'range' requires a non-zero bucket size")
            return lambda x: self._range_mask(x, bucket_size)
        if algorithm == "noise":
            noise_level = float(param) if param else 0.1
            return lambda x: self._add_noise(x, noise_level)

        # Date masking
        if algorithm == "date_shift":
            max_days = int(param) if param else 30
            return lambda x: self._date_shift(x, max_days)

        raise ValueError(f"Unknown masking algorithm: {algorithm}")

    # Hash functions
    def _hash_sha256(self, value: Any) -> Optional[str]:
        """Return the SHA-256 hex digest of ``str(value)``; ``None`` stays ``None``."""
        if value is None:
            return None
        return hashlib.sha256(str(value).encode()).hexdigest()

    def _hash_md5(self, value: Any) -> Optional[str]:
        """Return the MD5 hex digest of ``str(value)``; ``None`` stays ``None``."""
        if value is None:
            return None
        return hashlib.md5(str(value).encode()).hexdigest()

    def _hash_hmac(self, value: Any, key: str) -> Optional[str]:
        """Return the HMAC-SHA256 hex digest of ``str(value)`` keyed with ``key``."""
        if value is None:
            return None
        return hmac.new(key.encode(), str(value).encode(), hashlib.sha256).hexdigest()

    # Format-preserving masks
    def _mask_email(self, value: Any) -> Any:
        """Mask the local part of an e-mail address, keeping the domain.

        Falsy values are returned unchanged.  Text without ``@`` is handed to
        ``_partial_mask`` with two visible characters per side.  Local parts
        longer than two characters keep their first and last character.
        """
        if not value:
            return value
        email_str = str(value)
        if "@" not in email_str:
            return self._partial_mask(email_str, 2)

        local, domain = email_str.split("@", 1)
        if len(local) <= 2:
            masked_local = "*" * len(local)
        else:
            masked_local = local[0] + "*" * (len(local) - 2) + local[-1]
        return f"{masked_local}@{domain}"

    def _mask_phone(self, value: Any) -> Any:
        """Mask a phone number down to its first three digits.

        Non-digits are stripped first.  Fewer than ten digits yields one star
        per digit; otherwise the result is ``"<first 3 digits>-***-****"``.
        Falsy values are returned unchanged.
        """
        if not value:
            return value
        digits = re.sub(r"\D", "", str(value))
        if len(digits) < 10:
            return "*" * len(digits)
        return digits[:3] + "-***-****"

    def _mask_credit_card(self, value: Any) -> Any:
        """Star out all but the last four digits of a card number.

        Non-digits are stripped first; numbers with fewer than twelve digits
        are starred completely.  Falsy values are returned unchanged.
        """
        if not value:
            return value
        cc_str = re.sub(r"\D", "", str(value))
        if len(cc_str) < 12:
            return "*" * len(cc_str)
        return "*" * (len(cc_str) - 4) + cc_str[-4:]

    def _mask_ssn(self, value: Any) -> Any:
        """Mask a nine-digit SSN as ``***-**-NNNN`` (last four digits kept).

        Inputs that do not contain exactly nine digits are starred
        completely.  Falsy values are returned unchanged.
        """
        if not value:
            return value
        ssn_str = re.sub(r"\D", "", str(value))
        if len(ssn_str) != 9:
            return "*" * len(ssn_str)
        return "***-**-" + ssn_str[-4:]

    # Partial masking
    def _partial_mask(self, value: Any, chars_to_show: int) -> Any:
        """Keep ``chars_to_show`` characters at each end and star the middle.

        Values no longer than ``2 * chars_to_show`` are starred completely,
        as is everything when ``chars_to_show`` is zero or negative.  Falsy
        values are returned unchanged.
        """
        if not value:
            return value
        val_str = str(value)
        # ``val_str[-0:]`` is the whole string, so a zero count must be
        # handled here or the clear text would be appended after the stars.
        if chars_to_show <= 0 or len(val_str) <= chars_to_show * 2:
            return "*" * len(val_str)
        return (
            val_str[:chars_to_show]
            + "*" * (len(val_str) - chars_to_show * 2)
            + val_str[-chars_to_show:]
        )

    def _first_letter_mask(self, value: Any) -> Any:
        """Keep the first character and star the rest.

        Falsy values are returned unchanged; a single-character value is
        returned as text without masking.
        """
        if not value:
            return value
        val_str = str(value)
        if len(val_str) <= 1:
            return val_str
        return val_str[0] + "*" * (len(val_str) - 1)

    # Random replacement
    def _random_replace(self, value: Any) -> Any:
        """Replace ``value`` with random data of a similar shape.

        * ``int``: a random positive integer with the same number of digits.
        * ``float``: a random float in ``[0, 2 * abs(value)]``.
        * ``str``: random ASCII letters/digits of the same length.
        * ``None`` is returned unchanged.
        """
        if value is None:
            return value

        if isinstance(value, int):
            magnitude = len(str(abs(value)))
            return random.randint(10 ** (magnitude - 1), 10**magnitude - 1)
        if isinstance(value, float):
            return random.uniform(0, abs(value) * 2)
        if isinstance(value, str):
            return "".join(
                random.choices(string.ascii_letters + string.digits, k=len(value))
            )
        # NOTE(review): any other type is only converted to text, i.e. it is
        # NOT masked. Confirm whether that is intended for e.g. date columns.
        return str(value)

    # Tokenization
    def _tokenize_uuid(self, value: Any) -> Optional[str]:
        """Return a random UUID4 string that is stable per distinct ``str(value)``."""
        if value is None:
            return None
        val_str = str(value)
        if val_str not in self.token_cache:
            self.token_cache[val_str] = str(uuid.uuid4())
        return str(self.token_cache[val_str])

    def _tokenize_sequential(self, value: Any) -> Optional[int]:
        """Return a 1-based integer token that is stable per distinct ``str(value)``."""
        if value is None:
            return None
        val_str = str(value)
        if val_str not in self._sequential_tokens:
            self.sequential_counter += 1
            self._sequential_tokens[val_str] = self.sequential_counter
        return self._sequential_tokens[val_str]

    # Numeric masking
    def _round_number(self, value: Any, precision: int) -> Any:
        """Round ``value`` to the nearest multiple of ``precision``.

        Values that cannot be converted to a finite number are returned
        unchanged.
        """
        if value is None:
            return value
        try:
            num = float(value)
            return round(num / precision) * precision
        except (ValueError, TypeError, OverflowError):
            # OverflowError: round() of +/-inf; ValueError also covers NaN.
            return value

    def _range_mask(self, value: Any, bucket_size: int) -> Any:
        """Return the ``"lower-upper"`` bucket of width ``bucket_size`` holding ``value``.

        Values that cannot be converted to a finite number are returned
        unchanged.
        """
        if value is None:
            return value
        try:
            num = float(value)
            lower = int(num // bucket_size) * bucket_size
            upper = lower + bucket_size
            return f"{lower}-{upper}"
        except (ValueError, TypeError, OverflowError):
            return value

    def _add_noise(self, value: Any, noise_level: float) -> Any:
        """Perturb ``value`` by a random fraction within ``+/- noise_level``.

        Integer inputs are truncated back to ``int``; non-numeric inputs are
        returned unchanged.
        """
        if value is None:
            return value
        try:
            num = float(value)
            noise = random.uniform(-noise_level, noise_level) * abs(num)
            result = num + noise
            if isinstance(value, int):
                return int(result)
            return result
        except (ValueError, TypeError):
            return value

    # Date masking
    def _date_shift(self, value: Any, max_days: int) -> Any:
        """Shift a date by a random whole number of days in ``[-max_days, max_days]``.

        ``date``/``datetime`` objects are shifted directly.  Anything else is
        parsed with ``dateutil``; string inputs come back formatted as
        ``YYYY-MM-DD``.  If parsing or shifting fails the input is returned
        unchanged.
        """
        if value is None:
            return value

        if isinstance(value, (date, datetime)):
            shift_days = random.randint(-max_days, max_days)
            return value + timedelta(days=shift_days)

        # Try to parse string dates
        try:
            from dateutil import parser  # type: ignore

            dt = parser.parse(str(value))
            shift_days = random.randint(-max_days, max_days)
            result = dt + timedelta(days=shift_days)
            if isinstance(value, str):
                return result.strftime("%Y-%m-%d")
            return result
        except Exception:
            return value

    def _year_only(self, value: Any) -> Any:
        """Reduce a date to its year (``int``); unparseable input is returned unchanged."""
        if value is None:
            return value

        if isinstance(value, (date, datetime)):
            return value.year

        # Try to parse string dates
        try:
            from dateutil import parser

            dt = parser.parse(str(value))
            return dt.year
        except Exception:
            return value

    def _month_year(self, value: Any) -> Any:
        """Reduce a date to ``"YYYY-MM"``; unparseable input is returned unchanged."""
        if value is None:
            return value

        if isinstance(value, (date, datetime)):
            return f"{value.year}-{value.month:02d}"

        # Try to parse string dates
        try:
            from dateutil import parser

            dt = parser.parse(str(value))
            return f"{dt.year}-{dt.month:02d}"
        except Exception:
            return value
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def create_masking_mapper(mask_configs: list[str]) -> Callable:
    """Build a callable that applies the configured column masks to a data item.

    Each entry of ``mask_configs`` is a ``column:algorithm[:param]`` string.
    All columns share one ``MaskingEngine``.  The returned callable accepts a
    PyArrow table (masked via a pandas round trip) or a dict (masked in
    place); any other input is handed back untouched.
    """
    masker = MaskingEngine()

    # Resolve every configuration up front; the generator keeps parsing and
    # function lookup interleaved per entry, in configuration order.
    parsed_specs = (masker.parse_mask_config(spec) for spec in mask_configs)
    column_maskers = {
        col_name: masker.get_masking_function(algo, arg)
        for col_name, algo, arg in parsed_specs
    }

    def apply_masks(data: Any) -> Any:
        # PyArrow is optional: when it cannot be imported, fall through to
        # the dict handling below.
        try:
            import pyarrow as pa  # type: ignore

            if isinstance(data, pa.Table):
                frame = data.to_pandas()
                for col_name, mask_fn in column_maskers.items():
                    if col_name not in frame.columns:
                        continue
                    frame[col_name] = frame[col_name].apply(mask_fn)
                return pa.Table.from_pandas(frame)
        except ImportError:
            pass

        # Anything that is neither a table nor a dict is passed through.
        if not isinstance(data, dict):
            return data

        for col_name, mask_fn in column_maskers.items():
            if col_name not in data:
                continue
            # Best effort: a failing mask is reported and the original value
            # is left in place.
            try:
                data[col_name] = mask_fn(data[col_name])
            except Exception as exc:
                print(f"Warning: Failed to mask column {col_name}: {exc}")
        return data

    return apply_masks
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from typing import Iterable
|
|
2
|
+
|
|
3
|
+
import dlt
|
|
4
|
+
import pendulum
|
|
5
|
+
from dlt.common.typing import TDataItem
|
|
6
|
+
from dlt.sources import DltResource
|
|
7
|
+
|
|
8
|
+
from .client import MixpanelClient
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dlt.source(max_table_nesting=0)
def mixpanel_source(
    username: str,
    password: str,
    project_id: str,
    server: str,
    start_date: pendulum.DateTime,
    end_date: pendulum.DateTime | None = None,
) -> Iterable[DltResource]:
    """Define the Mixpanel ``events`` and ``profiles`` resources.

    Both resources use the ``merge`` write disposition keyed on
    ``distinct_id`` and read through a single ``MixpanelClient`` built from
    the given credentials, project id and server region.

    Args:
        start_date: Initial value of both incremental cursors.
        end_date: Optional end of the load window; when the cursor has no end
            value the current UTC time is used instead.
    """
    mixpanel = MixpanelClient(username, password, project_id, server)

    # The events cursor works on epoch seconds, the profiles cursor on
    # datetime values.
    events_start_ts = start_date.int_timestamp
    events_end_ts = end_date.int_timestamp if end_date else None

    # NOTE(review): events are merged on "distinct_id" alone — confirm that
    # this key is unique enough for individual events.
    @dlt.resource(write_disposition="merge", name="events", primary_key="distinct_id")
    def events(
        date=dlt.sources.incremental(
            "time",
            initial_value=events_start_ts,
            end_value=events_end_ts,
            range_end="closed",
            range_start="closed",
        ),
    ) -> Iterable[TDataItem]:
        """Yield exported events between the cursor's last value and its end."""
        window_end = (
            pendulum.now(tz="UTC")
            if date.end_value is None
            else pendulum.from_timestamp(date.end_value)
        )
        window_start = pendulum.from_timestamp(date.last_value)
        yield from mixpanel.fetch_events(window_start, window_end)

    @dlt.resource(write_disposition="merge", primary_key="distinct_id", name="profiles")
    def profiles(
        last_seen=dlt.sources.incremental(
            "last_seen",
            initial_value=start_date,
            end_value=end_date,
            range_end="closed",
            range_start="closed",
        ),
    ) -> Iterable[TDataItem]:
        """Yield profiles whose ``last_seen`` lies inside the cursor window."""
        window_end = (
            pendulum.now(tz="UTC")
            if last_seen.end_value is None
            else last_seen.end_value
        )
        yield from mixpanel.fetch_profiles(last_seen.last_value, window_end)

    return events, profiles
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Iterable
|
|
3
|
+
|
|
4
|
+
import pendulum
|
|
5
|
+
from dlt.sources.helpers.requests import Client
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class MixpanelClient:
    """HTTP client for Mixpanel's raw event export and profile query endpoints.

    Requests are authenticated with HTTP basic auth using ``username`` and
    ``password``.  ``server`` selects the regional host: ``"us"`` and
    ``"in"`` are recognised explicitly, any other value uses the EU host.
    """

    def __init__(self, username: str, password: str, project_id: str, server: str):
        self.username = username
        self.password = password
        self.project_id = project_id
        self.server = server
        # Status checking is done explicitly with ``raise_for_status()`` after
        # each request, so it is disabled on the shared session.
        self.session = Client(raise_for_status=False).session

    def fetch_events(
        self, start_date: pendulum.DateTime, end_date: pendulum.DateTime
    ) -> Iterable[dict]:
        """Yield exported events between ``start_date`` and ``end_date``.

        The export endpoint takes whole days (``YYYY-MM-DD``) and answers
        with one JSON document per line.  Each event's ``properties`` are
        flattened onto the event itself, with a leading ``$`` removed from
        property names.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        host = {"us": "data", "in": "data-in"}.get(self.server, "data-eu")

        url = f"https://{host}.mixpanel.com/api/2.0/export/"
        params = {
            "project_id": self.project_id,
            "from_date": start_date.format("YYYY-MM-DD"),
            "to_date": end_date.format("YYYY-MM-DD"),
        }
        headers = {
            "accept": "text/plain",
        }
        from requests.auth import HTTPBasicAuth

        auth = HTTPBasicAuth(self.username, self.password)
        # Stream the body: an export can be large, and without ``stream=True``
        # the whole response is downloaded before the first line is yielded.
        # The ``with`` block releases the connection even if the consumer
        # stops iterating early.
        with self.session.get(
            url, params=params, headers=headers, auth=auth, stream=True
        ) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines():
                if not line:
                    continue
                data = json.loads(line.decode())
                if "properties" in data:
                    for key, value in data["properties"].items():
                        if key.startswith("$"):
                            data[key[1:]] = value
                        else:
                            data[key] = value
                    del data["properties"]
                yield data

    def fetch_profiles(
        self, start_date: pendulum.DateTime, end_date: pendulum.DateTime
    ) -> Iterable[dict]:
        """Yield profiles whose ``$last_seen`` lies in ``[start_date, end_date]``.

        Pages are requested until one comes back without results; the
        ``session_id`` returned by the API is sent along with later pages.
        ``$``-prefixed profile properties are copied onto the result without
        the prefix, ``$last_seen`` is parsed with ``pendulum.parse`` and
        ``$distinct_id`` is renamed to ``distinct_id``.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        host_prefix = {"us": "", "in": "in."}.get(self.server, "eu.")
        url = f"https://{host_prefix}mixpanel.com/api/query/engage"
        headers = {
            "accept": "application/json",
            "content-type": "application/x-www-form-urlencoded",
        }
        from requests.auth import HTTPBasicAuth

        auth = HTTPBasicAuth(self.username, self.password)

        # The filter is identical for every page, so build it once.
        start_str = start_date.format("YYYY-MM-DDTHH:mm:ss")
        end_str = end_date.format("YYYY-MM-DDTHH:mm:ss")
        where = f'properties["$last_seen"] >= "{start_str}" and properties["$last_seen"] <= "{end_str}"'

        page = 0
        session_id = None
        while True:
            params = {"project_id": self.project_id, "page": str(page)}
            if session_id:
                params["session_id"] = session_id
            params["where"] = where
            resp = self.session.post(url, params=params, headers=headers, auth=auth)

            resp.raise_for_status()
            data = resp.json()

            for result in data.get("results", []):
                # NOTE(review): only "$"-prefixed properties are copied;
                # properties without the prefix are dropped together with
                # "$properties" — confirm this is intended.
                for key, value in result["$properties"].items():
                    if key.startswith("$"):
                        if key == "$last_seen":
                            result["last_seen"] = pendulum.parse(value)
                        else:
                            result[key[1:]] = value
                result["distinct_id"] = result["$distinct_id"]
                del result["$properties"]
                del result["$distinct_id"]
                yield result
            if not data.get("results"):
                break
            session_id = data.get("session_id", session_id)

            page += 1
|