mta-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mta/__init__.py +1 -0
- mta/data/data.csv.gz +0 -0
- mta/mta.py +956 -0
- mta-0.0.8.dist-info/METADATA +344 -0
- mta-0.0.8.dist-info/RECORD +7 -0
- mta-0.0.8.dist-info/WHEEL +5 -0
- mta-0.0.8.dist-info/top_level.txt +1 -0
mta/__init__.py
ADDED
@@ -0,0 +1 @@
from .mta import MTA, MTAConfig

mta/data/data.csv.gz
ADDED
Binary file
mta/mta.py
ADDED
@@ -0,0 +1,956 @@
import pandas as pd
from itertools import chain, tee, combinations
from functools import reduce, wraps
from operator import mul
from collections import defaultdict, Counter
import random
import time
import numpy as np
import copy
import os
import sys
import math
from collections.abc import Callable
from typing import List, Any, Dict, Tuple, DefaultDict, Optional, Union
from dataclasses import dataclass
import arrow

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


@dataclass
class MTAConfig:
    """Configuration for MTA model"""

    allow_loops: bool = False
    add_timepoints: bool = True
    sep: str = " > "
    normalize_by_default: bool = True


def show_time(func: Callable[..., Any]):
    """Timer decorator"""

    @wraps(func)
    def wrapper(*args, **kwargs):
        t0 = time.time()
        print(f"running {func.__name__}.. ", end="")
        sys.stdout.flush()

        v = func(*args, **kwargs)

        minutes, seconds = divmod(time.time() - t0, 60)
        st = "elapsed time:"
        if minutes:
            st += f" {minutes:.0f} min"
        if seconds:
            st += f" {seconds:.3f} sec"
        print(st)

        return v

    return wrapper


class MTA:
    """Multi-Touch Attribution model implementation"""

    def __init__(
        self,
        data: Union[str, pd.DataFrame] = "data.csv.gz",
        config: Optional[MTAConfig] = None,
        **kwargs,
    ) -> None:
        """
        Initialize MTA model

        Args:
            data: Path to CSV file or DataFrame
            config: MTAConfig object or use kwargs for individual settings
        """
        # Setup configuration
        if config is None:
            config = MTAConfig(
                **{
                    k: v
                    for k, v in kwargs.items()
                    if k in MTAConfig.__dataclass_fields__
                }
            )
        self.config = config

        # Constants
        self.NULL = "(null)"
        self.START = "(start)"
        self.CONV = "(conversion)"

        # Load and validate data
        self._load_data(data)
        self._validate_data()

        # Process data
        if config.add_timepoints:
            self.add_exposure_times()
        if not config.allow_loops:
            self.remove_loops()

        self._prepare_data()
        self._setup_channels()

        # Initialize results storage
        self.attribution: Dict[str, Dict[str, float]] = defaultdict(
            lambda: defaultdict(float)
        )

    def _load_data(self, data: Union[str, pd.DataFrame]) -> None:
        """Load data from file or DataFrame"""
        if isinstance(data, str):
            data_path = os.path.join(os.path.dirname(__file__), "data", data)
            self.data = pd.read_csv(data_path)
        elif isinstance(data, pd.DataFrame):
            self.data = data.copy()
        else:
            raise TypeError("data must be a file path or pandas DataFrame")

    def _validate_data(self) -> None:
        """Validate required columns exist"""
        required_cols = {"path", "total_conversions", "total_null"}
        if not required_cols.issubset(self.data.columns):
            raise ValueError(
                f"Data must contain columns: {required_cols}. "
                f"Found: {set(self.data.columns)}"
            )

    def _prepare_data(self) -> None:
        """Convert path and exposure_times to lists"""
        # DataFrame.map applies elementwise (pandas >= 2.1; older versions
        # call this .applymap)
        self.data[["path", "exposure_times"]] = self.data[
            ["path", "exposure_times"]
        ].map(lambda x: [ch.strip() for ch in str(x).split(self.config.sep.strip())])

    def _setup_channels(self) -> None:
        """Setup channel mappings and indices"""
        self.channels = sorted({ch for ch in chain.from_iterable(self.data["path"])})
        self.channels_ext = [self.START] + self.channels + [self.CONV, self.NULL]
        self.channel_name_to_index = {c: i for i, c in enumerate(self.channels_ext)}
        self.index_to_channel_name = {
            i: c for c, i in self.channel_name_to_index.items()
        }

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(channels={len(self.channels)}, "
            f"journeys={len(self.data)})"
        )

    def add_exposure_times(self, exposure_every_second: bool = True) -> "MTA":
        """Generate synthetic exposure times"""
        if "exposure_times" in self.data.columns:
            return self

        ts = []
        _t0 = arrow.utcnow()

        if exposure_every_second:
            for path_str in self.data["path"]:
                path_list = [ch.strip() for ch in path_str.split(">")]
                time_range = arrow.Arrow.range(
                    "second", _t0, _t0.shift(seconds=len(path_list) - 1)
                )
                ts.append(
                    self.config.sep.join(
                        [t.format("YYYY-MM-DD HH:mm:ss") for t in time_range]
                    )
                )

        self.data["exposure_times"] = ts
        return self

    @show_time
    def remove_loops(self) -> "MTA":
        """Remove consecutive duplicate channels"""
        cleaned_data = []

        for _, row in self.data.iterrows():
            path = [ch.strip() for ch in str(row["path"]).split(">")]
            exposure = [ch.strip() for ch in str(row["exposure_times"]).split(">")]

            clean_path, clean_exposure = [], []
            prev_channel = None

            for ch, exp in zip(path, exposure):
                if ch != prev_channel:
                    clean_path.append(ch)
                    clean_exposure.append(exp)
                prev_channel = ch

            cleaned_data.append(
                {
                    "path": self.config.sep.join(clean_path),
                    "exposure_times": self.config.sep.join(clean_exposure),
                    **{
                        col: row[col]
                        for col in self.data.columns
                        if col not in ["path", "exposure_times"]
                    },
                }
            )

        self.data = pd.DataFrame(cleaned_data)

        # Aggregate duplicate paths
        agg_dict = {
            col: "sum"
            for col in self.data.columns
            if col not in ["path", "exposure_times"]
        }
        agg_dict["exposure_times"] = "first"

        self.data = self.data.groupby("path", as_index=False).agg(agg_dict)
        return self

    @staticmethod
    def normalize_dict(
        dict_: Dict[Any, float], decimal_digits: int = 6
    ) -> Dict[Any, float]:
        """Normalize dictionary values to sum to 1"""
        total = sum(dict_.values())
        return (
            {k: round(v / total, decimal_digits) for k, v in dict_.items()}
            if total
            else dict_
        )

    def _apply_normalization(
        self, result: Dict[str, float], normalize: bool
    ) -> Dict[str, float]:
        """Apply normalization if requested"""
        return self.normalize_dict(result) if normalize else result

    @show_time
    def linear(self, share: str = "same", normalize: bool = True) -> "MTA":
        """
        Linear attribution model

        Args:
            share: 'same' for equal distribution or 'proportional' for weighted
            normalize: Whether to normalize results
        """
        if share not in ["same", "proportional"]:
            raise ValueError("share must be 'same' or 'proportional'")

        linear = defaultdict(float)

        for row in self.data.itertuples():
            if not row.total_conversions:
                continue

            if share == "same":
                unique_channels = set(row.path)
                credit_per_channel = row.total_conversions / len(unique_channels)
                for c in unique_channels:
                    linear[c] += credit_per_channel
            else:  # proportional
                channel_counts = Counter(row.path)
                total_touches = sum(channel_counts.values())
                for c, count in channel_counts.items():
                    linear[c] += row.total_conversions * (count / total_touches)

        linear = self._apply_normalization(dict(linear), normalize)
        self.attribution["linear"] = linear
        return self

    @show_time
    def position_based(
        self, first_weight: float = 40, last_weight: float = 40, normalize: bool = True
    ) -> "MTA":
        """
        Position-based attribution

        Args:
            first_weight: Percentage credit for first touch (0-100)
            last_weight: Percentage credit for last touch (0-100)
            normalize: Whether to normalize results
        """
        if first_weight + last_weight > 100:
            raise ValueError("Sum of first and last weights cannot exceed 100")

        position_based = defaultdict(float)

        for row in self.data.itertuples():
            if not row.total_conversions:
                continue

            path_len = len(row.path)

            if path_len == 1:
                position_based[row.path[0]] += row.total_conversions
            elif path_len == 2:
                credit = row.total_conversions / 2
                position_based[row.path[0]] += credit
                position_based[row.path[-1]] += credit
            else:
                position_based[row.path[0]] += (
                    first_weight * row.total_conversions / 100
                )
                position_based[row.path[-1]] += (
                    last_weight * row.total_conversions / 100
                )

                middle_credit = (
                    (100 - first_weight - last_weight) * row.total_conversions / 100
                )
                middle_channels = row.path[1:-1]
                credit_per_middle = middle_credit / len(middle_channels)

                for c in middle_channels:
                    position_based[c] += credit_per_middle

        position_based = self._apply_normalization(dict(position_based), normalize)
        self.attribution["pos_based"] = position_based
        return self

    @show_time
    def time_decay(
        self, count_direction: str = "left", normalize: bool = True
    ) -> "MTA":
        """
        Time decay attribution - channels closer to conversion get more credit

        Args:
            count_direction: 'left' (oldest first) or 'right' (newest first)
            normalize: Whether to normalize results
        """
        if count_direction not in ["left", "right"]:
            raise ValueError("count_direction must be 'left' or 'right'")

        time_decay = defaultdict(float)

        for row in self.data.itertuples():
            if not row.total_conversions:
                continue

            # Get unique channels in order of appearance
            seen = []
            path_to_iterate = (
                row.path if count_direction == "left" else reversed(row.path)
            )

            for c in path_to_iterate:
                if c not in seen:
                    seen.append(c)

            if count_direction == "right":
                seen.reverse()

            # Assign weights: 1, 2, 3, ... (linear growth)
            total_weight = sum(range(1, len(seen) + 1))

            for i, c in enumerate(seen, 1):
                time_decay[c] += (i / total_weight) * row.total_conversions

        time_decay = self._apply_normalization(dict(time_decay), normalize)
        self.attribution["time_decay"] = time_decay
        return self

    @show_time
    def first_touch(self, normalize: bool = True) -> "MTA":
        """First-touch attribution model"""
        first_touch = (
            self.data[self.data["total_conversions"] > 0]
            .groupby(self.data["path"].apply(lambda x: x[0]))["total_conversions"]
            .sum()
            .to_dict()
        )

        first_touch = self._apply_normalization(first_touch, normalize)
        self.attribution["first_touch"] = first_touch
        return self

    @show_time
    def last_touch(self, normalize: bool = True) -> "MTA":
        """Last-touch attribution model"""
        last_touch = (
            self.data[self.data["total_conversions"] > 0]
            .groupby(self.data["path"].apply(lambda x: x[-1]))["total_conversions"]
            .sum()
            .to_dict()
        )

        last_touch = self._apply_normalization(last_touch, normalize)
        self.attribution["last_touch"] = last_touch
        return self

    @staticmethod
    def pairs(lst: List[Any]) -> zip:
        """Generate consecutive pairs from list"""
        it1, it2 = tee(lst)
        next(it2, None)
        return zip(it1, it2)

    def count_pairs(self) -> DefaultDict[Tuple[str, str], int]:
        """Count channel pair transitions"""
        pair_counts = defaultdict(int)

        for row in self.data.itertuples():
            # Count transitions along the path
            for pair in self.pairs([self.START] + row.path):
                pair_counts[pair] += row.total_conversions + row.total_null

            # Add terminal transitions
            pair_counts[(row.path[-1], self.NULL)] += row.total_null
            pair_counts[(row.path[-1], self.CONV)] += row.total_conversions

        return pair_counts

    def ordered_tuple(self, t: Tuple[Any, ...]) -> Tuple[Any, ...]:
        """
        Return tuple t ordered
        Special case: if tuple starts with START, keep it first and sort the rest
        """
        if len(t) > 1 and t[0] == self.START:
            return (t[0],) + tuple(sorted(t[1:]))
        return tuple(sorted(t))

    def transition_matrix(self) -> Dict[Tuple[str, str], float]:
        """Calculate Markov transition probabilities"""
        pair_counts = self.count_pairs()

        # Calculate outgoing transition totals
        outgoing = defaultdict(int)
        for (from_ch, _), count in pair_counts.items():
            outgoing[from_ch] += count

        # Calculate probabilities
        return {pair: count / outgoing[pair[0]] for pair, count in pair_counts.items()}

    @show_time
    def simulate_path(
        self,
        trans_mat: Dict[Tuple[str, str], float],
        drop_channel: Optional[str] = None,
        n: int = int(1e6),
    ) -> Dict[str, int]:
        """Simulate random user journeys using Markov chain"""
        outcome_counts = defaultdict(int)

        idx_start = self.channel_name_to_index[self.START]
        idx_null = self.channel_name_to_index[self.NULL]
        idx_conv = self.channel_name_to_index[self.CONV]
        idx_drop = self.channel_name_to_index.get(drop_channel, idx_null)

        for _ in range(n):
            current_idx = idx_start

            while True:
                # Get transition probabilities from current state
                probs = [
                    trans_mat.get(
                        (
                            self.index_to_channel_name[current_idx],
                            self.index_to_channel_name[i],
                        ),
                        0,
                    )
                    for i in range(len(self.channels_ext))
                ]

                # Choose next state
                next_idx = np.random.choice(len(self.channels_ext), p=probs)

                if next_idx == idx_conv:
                    outcome_counts[self.CONV] += 1
                    break
                elif next_idx in {idx_null, idx_drop}:
                    outcome_counts[self.NULL] += 1
                    break
                else:
                    current_idx = next_idx

        return dict(outcome_counts)

    def _calculate_path_probability(
        self, path: List[str], trans_mat: Dict[Tuple[str, str], float]
    ) -> float:
        """Calculate probability of a specific path"""
        full_path = [self.START] + path + [self.CONV]
        probs = [trans_mat.get(pair, 0) for pair in self.pairs(full_path)]
        return reduce(mul, probs) if probs else 0

    def prob_convert(
        self, trans_mat: Dict[Tuple[str, str], float], drop: Optional[str] = None
    ) -> float:
        """Calculate total conversion probability"""
        # Filter data
        if drop:
            mask = ~self.data["path"].apply(lambda x: drop in x) & (
                self.data["total_conversions"] > 0
            )
        else:
            mask = self.data["total_conversions"] > 0

        filtered_data = self.data[mask]

        # Sum probabilities across all converting paths
        total_prob = sum(
            self._calculate_path_probability(row.path, trans_mat)
            for row in filtered_data.itertuples()
        )

        return total_prob

    @show_time
    def markov(self, sim: bool = False, normalize: bool = True) -> "MTA":
        """
        Markov chain attribution model

        Args:
            sim: Use simulation (True) or analytical calculation (False)
            normalize: Whether to normalize results
        """
        markov = defaultdict(float)
        trans_mat = self.transition_matrix()

        if sim:
            # Simulation-based approach
            outcomes_full = self.simulate_path(trans_mat, drop_channel=None)

            for channel in self.channels:
                outcomes_drop = self.simulate_path(trans_mat, drop_channel=channel)
                # dropping a channel may leave no simulated conversions at all,
                # hence the .get with a default of 0
                markov[channel] = (
                    outcomes_full[self.CONV] - outcomes_drop.get(self.CONV, 0)
                ) / outcomes_full[self.CONV]
        else:
            # Analytical approach
            p_base = self.prob_convert(trans_mat)

            for channel in self.channels:
                p_without = self.prob_convert(trans_mat, drop=channel)
                markov[channel] = (p_base - p_without) / p_base if p_base else 0

        markov = self._apply_normalization(dict(markov), normalize)
        self.attribution["markov"] = markov
        return self

    @show_time
    def shao(self, normalize: bool = True) -> "MTA":
        """
        Probabilistic model by Shao and Li (intended to be equivalent to
        Shapley). The explanation in the original paper can seem rather
        unclear; this answer is definitely helpful:
        https://stats.stackexchange.com/questions/255312/multi-channel-attribution-modelling-using-a-simple-probabilistic-model
        """
        r = defaultdict(lambda: defaultdict(float))

        # count user conversions and nulls for each visited channel and channel pair
        for row in self.data.itertuples():
            for n in range(1, 3):
                # combinations('ABCD', 2) --> AB AC AD BC BD CD
                for ch in combinations(set(row.path), n):
                    t = self.ordered_tuple(ch)
                    r[t][self.CONV] += row.total_conversions
                    r[t][self.NULL] += row.total_null

        for t in r:
            r[t]["conv_prob"] = r[t][self.CONV] / (r[t][self.CONV] + r[t][self.NULL])

        # calculate channel contributions
        self.C = defaultdict(float)

        for row in self.data.itertuples():
            for ch_i in set(row.path):
                if row.total_conversions:
                    pc = 0  # contribution for the current path

                    other_channels = set(row.path) - {ch_i}
                    k = 2 * len(other_channels) if other_channels else 1

                    for ch_j in other_channels:
                        pc += (
                            r[self.ordered_tuple((ch_i, ch_j))]["conv_prob"]
                            - r[(ch_i,)]["conv_prob"]
                            - r[(ch_j,)]["conv_prob"]
                        )

                    pc = r[(ch_i,)]["conv_prob"] + pc / k

                    self.C[ch_i] += row.total_conversions * pc

        if normalize:
            self.C = self.normalize_dict(self.C)

        self.attribution["shao"] = self.C

        return self

    def get_generated_conversions(self, max_subset_size: int = 3) -> "MTA":
        """Count the conversions and nulls generated by every channel subset
        of size up to max_subset_size"""
        self.cc = defaultdict(lambda: defaultdict(float))

        for ch_list, convs, nulls in zip(
            self.data["path"], self.data["total_conversions"], self.data["total_null"]
        ):
            # tally every channel subset that appears in this journey
            for n in range(1, max_subset_size + 1):
                for tup in combinations(set(ch_list), n):
                    tup_ = self.ordered_tuple(tup)
                    self.cc[tup_][self.CONV] += convs
                    self.cc[tup_][self.NULL] += nulls

        return self

    def v(self, coalition: Tuple[Any, ...]) -> float:
        """
        total number of conversions generated by all subsets of the coalition;
        coalition is a tuple of channels
        """
        s = len(coalition)

        total_convs = 0

        for n in range(1, s + 1):
            for tup in combinations(coalition, n):
                tup_ = self.ordered_tuple(tup)
                total_convs += self.cc[tup_][self.CONV]

        return total_convs

    def w(self, s: int, n: int) -> float:
        """Shapley weight for a coalition of size s out of n channels:
        s! * (n - s - 1)! / n!"""
        # the weight is zero for degenerate coalition sizes
        if s >= n or s < 0:
            return 0
        return math.factorial(s) * math.factorial(n - s - 1) / math.factorial(n)

    @show_time
    def shapley(self, max_coalition_size: int = 2, normalize: bool = True) -> "MTA":
        """
        Shapley value attribution

        Args:
            max_coalition_size: Maximum size of coalitions to consider
            normalize: Whether to normalize results
        """
        # subsets of size max_coalition_size + 1 are needed once a channel
        # joins a coalition of the maximum size
        self.get_generated_conversions(max_subset_size=max_coalition_size + 1)

        shapley = defaultdict(float)
        n_channels = len(self.channels)

        for channel in self.channels:
            other_channels = set(self.channels) - {channel}

            # Consider all subset sizes up to max_coalition_size
            for subset_size in range(
                1, min(max_coalition_size + 1, len(other_channels) + 1)
            ):
                for coalition in combinations(other_channels, subset_size):
                    marginal_contribution = self.v(coalition + (channel,)) - self.v(
                        coalition
                    )
                    weight = self.w(len(coalition), n_channels)
                    shapley[channel] += marginal_contribution * weight

        shapley = self._apply_normalization(dict(shapley), normalize)
        self.attribution["shapley"] = shapley
        return self

    @show_time
    def logistic_regression(
        self,
        test_size: float = 0.25,
        sample_rows: float = 0.5,
        sample_features: float = 0.5,
        normalize: bool = True,
        n_iterations: int = 2000,
    ) -> "MTA":
        """
        Logistic regression attribution using ensemble approach

        Args:
            test_size: Proportion for test set
            sample_rows: Proportion of rows to sample each iteration
            sample_features: Proportion of features to sample each iteration
            normalize: Whether to normalize results
            n_iterations: Number of bootstrap iterations
        """
        # Build feature matrix
        records = []
        for row in self.data.itertuples():
            channel_set = {c: 1 for c in row.path}

            for _ in range(row.total_conversions):
                records.append({**channel_set, "conv": 1})

            for _ in range(row.total_null):
                records.append({**channel_set, "conv": 0})

        df = pd.DataFrame(records).fillna(0).sample(frac=1.0, random_state=42)

        # Ensemble learning
        coef_sum = defaultdict(float)

        for i in range(n_iterations):
            # Sample half of each class (keeps the class mix while halving the data)
            df_conv = df[df["conv"] == 1].sample(frac=0.5, random_state=i)
            df_null = df[df["conv"] == 0].sample(frac=0.5, random_state=i)
            df_sample = pd.concat([df_conv, df_null]).sample(
                frac=sample_rows, random_state=i
            )

            # Feature sampling
            feature_cols = (
                df_sample.drop("conv", axis=1)
                .sample(frac=sample_features, axis=1, random_state=i)
                .columns
            )

            X = df_sample[feature_cols]
            y = df_sample["conv"]

            # Train model
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=i, stratify=y
            )

            clf = LogisticRegression(
                random_state=i, solver="lbfgs", fit_intercept=False, max_iter=1000
            ).fit(X_train, y_train)

            # Accumulate coefficients
            for channel, coef in zip(X_train.columns, clf.coef_[0]):
                coef_sum[channel] += abs(coef)

        # Average coefficients
        lr_attribution = {ch: coef / n_iterations for ch, coef in coef_sum.items()}

        lr_attribution = self._apply_normalization(lr_attribution, normalize)
        self.attribution["linreg"] = lr_attribution
        return self

    def show(self, channels: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Display attribution results

        Args:
            channels: Specific channels to show (None for all)

        Returns:
            DataFrame with attribution results
        """
        df = pd.DataFrame.from_dict(self.attribution)

        if channels:
            df = df.loc[df.index.isin(channels)]

        # Sort by index (channel name)
        df = df.sort_index()

        print("\nAttribution Results:")
        print("=" * 80)
        print(df.to_string())
        print("=" * 80)

        return df

    def compare_models(self) -> pd.DataFrame:
        """Compare all attribution models side by side"""
        df = self.show()

        # Add summary statistics
        print("\nModel Statistics:")
        print(df.describe())

        return df

    def export_results(self, filepath: str, format: str = "csv") -> None:
        """
        Export attribution results

        Args:
            filepath: Output file path
            format: 'csv', 'json', or 'excel'
        """
        df = pd.DataFrame.from_dict(self.attribution)

        if format == "csv":
            df.to_csv(filepath)
        elif format == "json":
            df.to_json(filepath, orient="index", indent=2)
        elif format == "excel":
            df.to_excel(filepath)
        else:
            raise ValueError(f"Unsupported format: {format}")

        print(f"Results exported to {filepath}")

    def pi(
        self,
        path: List[str],
        exposure_times: List[str],
        conv_flag: int,
        beta_by_channel: Dict[str, float],
        omega_by_channel: Dict[str, float],
    ) -> Dict[str, float]:
        """
        Calculate the contribution of channel i to the conversion of journey
        (user) u - p_i^u in the paper

        - path is the list of channels in the journey; the terminal states
          (null) and (conversion) are excluded
        - exposure_times is the matching list of exposure timestamps
        """
        p = {c: 0 for c in path}  # contributions by channel

        # all contributions are zero if no conversion
        if not conv_flag:
            return p

        # seconds between each touch and the last exposure; total_seconds()
        # keeps gaps longer than a day from being truncated
        dts = [
            (arrow.get(exposure_times[-1]) - arrow.get(t)).total_seconds()
            for t in exposure_times
        ]

        raw = defaultdict(float)

        for c, dt in zip(path, dts):
            raw[c] += (
                beta_by_channel[c]
                * omega_by_channel[c]
                * np.exp(-omega_by_channel[c] * dt)
            )

        for c in raw:
            p[c] = raw[c] / sum(raw.values())

        return p

    def update_coefs(
        self, beta: Dict[str, float], omega: Dict[str, float]
    ) -> Tuple[Dict[str, float], Dict[str, float], int]:
        """
        return updated beta and omega along with the number of coefficients
        that have converged
        """
        delta = 1e-3

        beta_num = defaultdict(float)
        beta_den = defaultdict(float)
        omega_den = defaultdict(float)

        for row in self.data.itertuples():
            p = self.pi(
                row.path, row.exposure_times, row.total_conversions, beta, omega
            )

            r = copy.deepcopy(row.path)

            dts = [
                (arrow.get(row.exposure_times[-1]) - arrow.get(t)).total_seconds()
                for t in row.exposure_times
            ]

            while r:
                # pick channels starting from the last one
                c = r.pop()
                dt = dts.pop()

                beta_den[c] += 1.0 - np.exp(-omega[c] * dt)
                omega_den[c] += p[c] * dt + beta[c] * dt * np.exp(-omega[c] * dt)

                beta_num[c] += p[c]

        # now that we've gone through every user, update the coefficients for
        # every channel
        beta0 = copy.deepcopy(beta)
        omega0 = copy.deepcopy(omega)

        converged = []

        for c in self.channels:
            beta_num[c] = (beta_num[c] > 1e-6) * beta_num[c]
            beta_den[c] = (beta_den[c] > 1e-6) * beta_den[c]
            omega_den[c] = max(omega_den[c], 1e-6)

            if beta_den[c]:
                beta[c] = beta_num[c] / beta_den[c]

            omega[c] = beta_num[c] / omega_den[c]

            converged.append(abs(beta[c] - beta0[c]) < delta)
            converged.append(abs(omega[c] - omega0[c]) < delta)

        return beta, omega, sum(converged)

    @show_time
    def additive_hazard(self, epochs: int = 20, normalize: bool = True) -> "MTA":
        """
        Additive hazard model as in "Multi-Touch Attribution in Online
        Advertising with Survival Theory" (Zhang et al., 2014)
        """
        beta = {c: random.uniform(0.001, 1) for c in self.channels}
        omega = {c: random.uniform(0.001, 1) for c in self.channels}

        for epoch in range(epochs):
            beta, omega, h = self.update_coefs(beta, omega)

            # stop once every beta and omega coefficient has converged
            if h == 2 * len(self.channels):
                print(f"converged after {epoch + 1} iterations")
                break

        additive_hazard = defaultdict(float)

        for row in self.data.itertuples():
            p = self.pi(
                row.path, row.exposure_times, row.total_conversions, beta, omega
            )

            for c in p:
                # weight each channel's contribution by the journey's conversions
                additive_hazard[c] += p[c] * row.total_conversions

        if normalize:
            additive_hazard = self.normalize_dict(additive_hazard)

        self.attribution["add_haz"] = additive_hazard

        return self


if __name__ == "__main__":

    mta = MTA(data="data.csv.gz", allow_loops=False)

    (
        mta.linear(share="proportional")
        .time_decay(count_direction="right")
        .shapley()
        .shao()
        .first_touch()
        .position_based()
        .last_touch()
        .markov(sim=False)
        .logistic_regression()
        .additive_hazard()
        .show()
    )

mta-0.0.8.dist-info/METADATA
ADDED
@@ -0,0 +1,344 @@
Metadata-Version: 2.4
Name: mta
Version: 0.0.8
Summary: Multi-Touch Attribution Models for Marketing Analytics
Home-page: https://github.com/eeghor/mta
Author: Igor Korostil
Author-email: Igor Korostil <eeghor@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/eeghor/mta
Project-URL: Issues, https://github.com/eeghor/mta/issues
Keywords: attribution,marketing,multi-touch,analytics,markov,shapley
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: pandas>=1.3.0
Requires-Dist: numpy>=1.20.0
Requires-Dist: scikit-learn>=0.24.0
Requires-Dist: arrow>=1.0.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# Multi-Touch Attribution (MTA)

A comprehensive Python library for multi-touch attribution modeling in marketing analytics. This library implements various attribution models to help marketers understand the contribution of different touchpoints in the customer journey.

## 🎯 Features

### Attribution Models Implemented

- **First Touch**: 100% credit to the first interaction
- **Last Touch**: 100% credit to the last interaction before conversion
- **Linear**: Equal credit distribution across all touchpoints
- **Position-Based (U-Shaped)**: Customizable weights for first/last touch with remaining credit distributed to middle touches
- **Time Decay**: Higher credit to more recent touchpoints
- **Markov Chain**: Probabilistic model using transition matrices
- **Shapley Value**: Game-theoretic fair allocation based on marginal contributions
- **Shao's Model**: Probabilistic Shapley-equivalent approach
- **Logistic Regression**: Machine learning-based ensemble attribution
- **Additive Hazard**: Survival analysis-based attribution

## 📦 Installation

```bash
pip install mta
```

Or install from source:

```bash
git clone https://github.com/eeghor/mta.git
cd mta
pip install -e .
```

## 🚀 Quick Start

### Basic Usage

```python
from mta import MTA

# Initialize with your data
mta = MTA(data="your_data.csv", allow_loops=False, add_timepoints=True)

# Run a single attribution model
mta.linear(share="proportional", normalize=True)
mta.show()

# Chain multiple models
(mta.linear(share="proportional")
    .time_decay(count_direction="right")
    .markov(sim=False)
    .shapley()
    .show())
```

### Using Configuration

```python
from mta import MTA, MTAConfig

# Create custom configuration
config = MTAConfig(
    allow_loops=False,
    add_timepoints=True,
    sep=" > ",
    normalize_by_default=True
)

mta = MTA(data="data.csv", config=config)
```

### Working with DataFrames

```python
import pandas as pd
from mta import MTA

# Load your data
df = pd.read_csv("customer_journeys.csv")

# Initialize MTA with DataFrame
mta = MTA(data=df, allow_loops=False)

# Run attribution models
mta.first_touch().last_touch().linear().show()
```

## 📊 Data Format

Your input data should be a CSV file or pandas DataFrame with the following columns:

```
path,total_conversions,total_null,exposure_times
alpha > beta > gamma,10,5,2023-01-01 10:00:00 > 2023-01-01 11:00:00 > 2023-01-01 12:00:00
beta > gamma,5,3,2023-01-02 09:00:00 > 2023-01-02 10:00:00
```

**Required Columns:**

- `path`: Customer journey as channel names separated by `>` (or custom separator)
- `total_conversions`: Number of conversions for this path
- `total_null`: Number of non-conversions for this path
- `exposure_times`: Timestamps of channel exposures (optional, can be auto-generated)
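
The same journeys can also be assembled directly in memory. A minimal sketch (channel names reused from the example above; `exposure_times` is left out and auto-generated because `add_timepoints=True`):

```python
import pandas as pd
from mta import MTA

journeys = pd.DataFrame(
    {
        "path": ["alpha > beta > gamma", "beta > gamma"],
        "total_conversions": [10, 5],
        "total_null": [5, 3],
    }
)

mta = MTA(data=journeys, add_timepoints=True)
```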
## ๐จ Advanced Usage
|
|
136
|
+
|
|
137
|
+
### Position-Based Attribution with Custom Weights
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
# Give 30% to first touch, 30% to last touch, 40% distributed to middle
|
|
141
|
+
mta.position_based(first_weight=30, last_weight=30, normalize=True)
|
|
142
|
+
```
|
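
To make the split concrete, here is the arithmetic for a hypothetical four-touch converting path `A > B > C > D` under those weights:

```python
# A (first) gets 0.30, D (last) gets 0.30, and the remaining 0.40 is
# shared equally by the middle touches B and C
weights = {"A": 0.30, "B": 0.20, "C": 0.20, "D": 0.30}
assert abs(sum(weights.values()) - 1.0) < 1e-9
```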

### Time Decay with Direction Control

```python
# Count from left (earliest gets lowest credit)
mta.time_decay(count_direction="left")

# Count from right (latest gets highest credit - more common)
mta.time_decay(count_direction="right")
```
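
Under the hood, each unique channel gets a linearly growing weight (1, 2, 3, ...) normalized by the sum of the weights, so the touch closest to conversion earns the largest share. A worked sketch for a three-channel path `A > B > C`:

```python
# weights 1, 2, 3 are normalized by 1 + 2 + 3 = 6
total = 1 + 2 + 3
shares = {"A": 1 / total, "B": 2 / total, "C": 3 / total}
assert shares["C"] == 0.5  # the last touch gets half the credit
```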

### Markov Chain Attribution

```python
# Analytical calculation (faster)
mta.markov(sim=False, normalize=True)

# Simulation-based (more flexible, handles complex scenarios)
mta.markov(sim=True, normalize=True)
```
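
Both variants estimate each channel's removal effect: how much the overall conversion probability drops when the channel is taken out of the chain. A sketch with toy numbers (not real output):

```python
# if the baseline conversion probability is P = 0.10 and dropping a
# channel lowers it to 0.07, that channel's unnormalized credit is
P, P_drop = 0.10, 0.07
removal_effect = (P - P_drop) / P  # 0.3
```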

### Shapley Value Attribution

```python
# With custom coalition size
mta.shapley(max_coalition_size=3, normalize=True)
```
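
Each marginal contribution is weighted by the classic Shapley factor `s!(n - s - 1)!/n!`, where `s` is the coalition size and `n` is the number of channels. For example, with `n = 4` channels, a channel joining a coalition of size `s = 2` carries weight 1/12:

```python
import math

s, n = 2, 4
weight = math.factorial(s) * math.factorial(n - s - 1) / math.factorial(n)
assert abs(weight - 1 / 12) < 1e-12
```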

### Logistic Regression Ensemble

```python
# Custom sampling and iteration parameters
mta.logistic_regression(
    test_size=0.25,
    sample_rows=0.5,
    sample_features=0.5,
    n_iterations=1000,
    normalize=True
)
```
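
Each iteration fits a logistic regression on a random subsample of rows and features; a channel's final score is its accumulated absolute coefficient averaged over the iterations. A toy sketch of that last step:

```python
n_iterations = 3
coef_sum = {"alpha": 2.7, "beta": 1.2}  # accumulated |coef| over 3 runs
attribution = {ch: s / n_iterations for ch, s in coef_sum.items()}
# -> alpha ~ 0.9, beta ~ 0.4
```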

### Export Results

```python
# Compare all models
results_df = mta.compare_models()

# Export to various formats
mta.export_results("attribution_results.csv", format="csv")
mta.export_results("attribution_results.json", format="json")
mta.export_results("attribution_results.xlsx", format="excel")
```
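
The exported table has one row per channel and one column per model key registered in `mta.attribution` (for example `linear`, `markov`, `shapley`). A small sketch of reading it back (values would be whatever your run produced):

```python
import pandas as pd

# rows are channels, columns are the models that have been run
df = pd.read_csv("attribution_results.csv", index_col=0)
print(df.idxmax())  # channel with the largest share under each model
```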

## 📈 Example: Complete Analysis Pipeline

```python
from mta import MTA
import pandas as pd

# Load data
mta = MTA(
    data="customer_journeys.csv",
    allow_loops=False,    # Remove consecutive duplicate channels
    add_timepoints=True   # Auto-generate timestamps if missing
)

# Run all heuristic models
(mta
    .first_touch()
    .last_touch()
    .linear(share="proportional")
    .position_based(first_weight=40, last_weight=40)
    .time_decay(count_direction="right"))

# Run algorithmic models
(mta
    .markov(sim=False)
    .shapley(max_coalition_size=2)
    .shao()
    .logistic_regression(n_iterations=2000)
    .additive_hazard(epochs=20))

# Display and export results
results = mta.compare_models()
mta.export_results("full_attribution_analysis.csv")

# Access specific model results
print(f"Markov Attribution: {mta.attribution['markov']}")
print(f"Shapley Attribution: {mta.attribution['shapley']}")
```
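
`mta.attribution` is a plain dict of dicts keyed by model name, so results can be post-processed directly (keys are the ones registered by the models above; the values below are illustrative):

```python
# {'markov':  {'alpha': 0.38, 'beta': 0.35, 'gamma': 0.27},
#  'shapley': {'alpha': 0.40, 'beta': 0.34, 'gamma': 0.26}, ...}
markov = mta.attribution["markov"]
top_channel = max(markov, key=markov.get)
```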

## 🔬 Model Comparison

| Model               | Type             | Strengths                | Use Case                     |
| ------------------- | ---------------- | ------------------------ | ---------------------------- |
| First/Last Touch    | Heuristic        | Simple, fast             | Quick baseline               |
| Linear              | Heuristic        | Fair, interpretable      | Equal value assumption       |
| Position-Based      | Heuristic        | Balances first/last      | Awareness + conversion focus |
| Time Decay          | Heuristic        | Recency-weighted         | When recent matters more     |
| Markov Chain        | Algorithmic      | Considers path structure | Sequential dependency        |
| Shapley Value       | Algorithmic      | Game-theoretic fairness  | Complex interactions         |
| Logistic Regression | Machine Learning | Data-driven              | Large datasets               |
| Additive Hazard     | Statistical      | Time-to-event modeling   | Survival analysis fans       |

## 🛠️ Requirements

- Python >= 3.8
- pandas >= 1.3.0
- numpy >= 1.20.0
- scikit-learn >= 0.24.0
- arrow >= 1.0.0

## 📖 Citation

If you use this library in your research, please cite:

```bibtex
@software{mta2024,
  author = {Igor Korostil},
  title = {MTA: Multi-Touch Attribution Library},
  year = {2024},
  url = {https://github.com/eeghor/mta}
}
```

## 📚 References

This library implements models and techniques from the following research papers:

1. **Nisar, T. M., & Yeung, M. (2015)**
   _Purchase Conversions and Attribution Modeling in Online Advertising: An Empirical Investigation_
   [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2612997)

2. **Shao, X., & Li, L. (2011)**
   _Data-driven Multi-touch Attribution Models_
   Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining
   [PDF](https://dl.acm.org/doi/10.1145/2020408.2020453)

3. **Dalessandro, B., Perlich, C., Stitelman, O., & Provost, F. (2012)**
   _Causally Motivated Attribution for Online Advertising_
   Proceedings of the Sixth International Workshop on Data Mining for Online Advertising
   [PDF](https://dl.acm.org/doi/10.1145/2351356.2351363)

4. **Cano-Berlanga, S., Giménez-Gómez, J. M., & Vilella, C. (2017)**
   _Attribution Models and the Cooperative Game Theory_
   Expert Systems with Applications, 87, 277-286
   [PDF](https://www.sciencedirect.com/science/article/abs/pii/S0957417417304505)

5. **Ren, K., Fang, Y., Zhang, W., Liu, S., Li, J., Zhang, Y., Yu, Y., & Wang, J. (2018)**
   _Learning Multi-touch Conversion Attribution with Dual-attention Mechanisms for Online Advertising_
   Proceedings of the 27th ACM International Conference on Information and Knowledge Management
   [PDF](https://dl.acm.org/doi/10.1145/3269206.3271676)

6. **Zhang, Y., Wei, Y., & Ren, J. (2014)**
   _Multi-Touch Attribution in Online Advertising with Survival Theory_
   2014 IEEE International Conference on Data Mining
   [PDF](https://ieeexplore.ieee.org/document/7023387)

7. **Geyik, S. C., Saxena, A., & Dasdan, A. (2014)**
   _Multi-Touch Attribution Based Budget Allocation in Online Advertising_
   Proceedings of the 8th International Workshop on Data Mining for Online Advertising
   [PDF](https://dl.acm.org/doi/10.1145/2648584.2648586)

### Model-to-Paper Mapping

- **Linear & Position-Based**: Baseline models referenced across multiple papers
- **Time Decay**: Nisar & Yeung (2015), Zhang et al. (2014)
- **Markov Chain**: Shao & Li (2011), Dalessandro et al. (2012)
- **Shapley Value**: Cano-Berlanga et al. (2017)
- **Logistic Regression**: Dalessandro et al. (2012), Ren et al. (2018)
- **Additive Hazard**: Zhang et al. (2014)

## 🤝 Contributing

Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.

1. Fork the repository
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## 📄 License

This project is licensed under the MIT License - see the LICENSE file for details.

## 🙏 Acknowledgments

- Inspired by various academic papers on marketing attribution
- Built with pandas, numpy, and scikit-learn
- Special thanks to the open-source community

## 📧 Contact

Igor Korostil - eeghor@gmail.com

Project Link: [https://github.com/eeghor/mta](https://github.com/eeghor/mta)

## 🐛 Known Issues

- Shapley value computation can be slow for large numbers of channels
- Additive hazard model requires evenly-spaced time points for best results

mta-0.0.8.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
mta/__init__.py,sha256=KuXyr5J0rZ1snuaB2kxabi7JIUBcwVeTYCN_d_JomVw,20
mta/mta.py,sha256=dIfwLBiJbxQ4CfdZod99C1MUsuzsflNJ2JYrrNTAzK0,30740
mta/data/data.csv.gz,sha256=zN9K0IRC4NAyNMbFAlxoKCgBG_1qtynxG7g3um_ctKg,82453
mta-0.0.8.dist-info/METADATA,sha256=7shXR-jjNI4MTvui9YvZV8zLNQYOMpnr5sdKj2kWxV4,10962
mta-0.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
mta-0.0.8.dist-info/top_level.txt,sha256=yGDzRyCpTtg9Np96GTL1VIgYL8Oya3v2G07vFR8SxAk,4
mta-0.0.8.dist-info/RECORD,,

mta-0.0.8.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
mta