traffic-taffy 0.8.1__tar.gz → 0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/.gitignore +1 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/PKG-INFO +4 -1
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/pyproject.toml +11 -0
- traffic_taffy-0.9/traffic_taffy/__init__.py +1 -0
- traffic_taffy-0.9/traffic_taffy/algorithms/__init__.py +21 -0
- traffic_taffy-0.9/traffic_taffy/algorithms/comparecorrelation.py +164 -0
- traffic_taffy-0.9/traffic_taffy/algorithms/comparecorrelationchanges.py +210 -0
- traffic_taffy-0.9/traffic_taffy/algorithms/compareseries.py +117 -0
- traffic_taffy-0.9/traffic_taffy/algorithms/compareslices.py +116 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/algorithms/statistical.py +9 -9
- traffic_taffy-0.9/traffic_taffy/compare.py +250 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/comparison.py +18 -4
- traffic_taffy-0.9/traffic_taffy/config.py +133 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissection.py +171 -6
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissectmany.py +26 -16
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissector.py +189 -77
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissector_engine/scapy.py +41 -8
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/graph.py +54 -53
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/graphdata.py +13 -2
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/hooks/ip2asn.py +20 -7
- traffic_taffy-0.9/traffic_taffy/hooks/labels.py +45 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/hooks/psl.py +21 -3
- traffic_taffy-0.9/traffic_taffy/iana/tables.msgpak +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/output/__init__.py +8 -48
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/output/console.py +37 -25
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/output/fsdb.py +24 -18
- traffic_taffy-0.9/traffic_taffy/reports/__init__.py +5 -0
- traffic_taffy-0.9/traffic_taffy/reports/compareslicesreport.py +85 -0
- traffic_taffy-0.9/traffic_taffy/reports/correlationchangereport.py +54 -0
- traffic_taffy-0.9/traffic_taffy/reports/correlationreport.py +42 -0
- traffic_taffy-0.9/traffic_taffy/taffy_config.py +44 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_compare_results.py +22 -7
- traffic_taffy-0.9/traffic_taffy/tests/test_config.py +149 -0
- traffic_taffy-0.9/traffic_taffy/tests/test_global_config.py +33 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_normalize.py +1 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_pcap_dissector.py +12 -2
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_pcap_splitter.py +21 -10
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tools/cache_info.py +3 -2
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tools/compare.py +32 -24
- traffic_taffy-0.9/traffic_taffy/tools/config.py +83 -0
- traffic_taffy-0.9/traffic_taffy/tools/dissect.py +111 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tools/explore.py +5 -4
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tools/export.py +28 -17
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tools/graph.py +25 -27
- traffic_taffy-0.8.1/traffic_taffy/__init__.py +0 -1
- traffic_taffy-0.8.1/traffic_taffy/algorithms/__init__.py +0 -14
- traffic_taffy-0.8.1/traffic_taffy/compare.py +0 -260
- traffic_taffy-0.8.1/traffic_taffy/report.py +0 -12
- traffic_taffy-0.8.1/traffic_taffy/tests/test_dpkt_engine.py +0 -15
- traffic_taffy-0.8.1/traffic_taffy/tools/dissect.py +0 -119
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/LICENSE.txt +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/README.md +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissector_engine/__init__.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissector_engine/dnstap.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/dissector_engine/dpkt.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/hooks/__init__.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/output/memory.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_dict_merge.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_hooks.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_splitter.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tests/test_value_printing.py +0 -0
- {traffic_taffy-0.8.1 → traffic_taffy-0.9}/traffic_taffy/tools/__init__.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: traffic-taffy
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9
|
4
4
|
Summary: A tool for doing differential analysis of pcap files
|
5
5
|
Project-URL: Homepage, https://traffic-taffy.github.io/
|
6
6
|
Author-email: Wes Hardaker <opensource@hardakers.net>
|
@@ -8,8 +8,10 @@ License-File: LICENSE.txt
|
|
8
8
|
Classifier: Operating System :: OS Independent
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
10
10
|
Requires-Python: >=3.7
|
11
|
+
Requires-Dist: argparse-with-config>=1.1.4
|
11
12
|
Requires-Dist: cryptography
|
12
13
|
Requires-Dist: dnssplitter
|
14
|
+
Requires-Dist: dotnest>=1.0
|
13
15
|
Requires-Dist: dpkt
|
14
16
|
Requires-Dist: ip2asn
|
15
17
|
Requires-Dist: msgpack
|
@@ -19,6 +21,7 @@ Requires-Dist: pyfsdb
|
|
19
21
|
Requires-Dist: pyopenssl==22.1.0
|
20
22
|
Requires-Dist: pyqt6-charts
|
21
23
|
Requires-Dist: rich
|
24
|
+
Requires-Dist: rich-argparse
|
22
25
|
Requires-Dist: scapy
|
23
26
|
Requires-Dist: seaborn
|
24
27
|
Description-Content-Type: text/markdown
|
@@ -24,14 +24,20 @@ dependencies = [
|
|
24
24
|
"pyfsdb",
|
25
25
|
"PyQt6-Charts",
|
26
26
|
"rich",
|
27
|
+
"rich_argparse",
|
27
28
|
"scapy",
|
28
29
|
"seaborn",
|
29
30
|
"cryptography",
|
30
31
|
"pyOpenSSL==22.1.0",
|
31
32
|
"dnssplitter",
|
32
33
|
"ip2asn",
|
34
|
+
"dotnest>=1.0",
|
35
|
+
"argparse-with-config>=1.1.4",
|
33
36
|
]
|
34
37
|
|
38
|
+
[project.package_data]
|
39
|
+
"traffic_taffy.iana" = ['tables.msgpak']
|
40
|
+
|
35
41
|
[project.scripts]
|
36
42
|
taffy-cache-info = "traffic_taffy.tools.cache_info:main"
|
37
43
|
taffy-compare = "traffic_taffy.tools.compare:main"
|
@@ -39,6 +45,7 @@ taffy-dissect = "traffic_taffy.tools.dissect:main"
|
|
39
45
|
taffy-explorer = "traffic_taffy.tools.explorer:main"
|
40
46
|
taffy-graph = "traffic_taffy.tools.graph:main"
|
41
47
|
taffy-export = "traffic_taffy.tools.export:main"
|
48
|
+
taffy-config = "traffic_taffy.tools.config:main"
|
42
49
|
|
43
50
|
[project.urls]
|
44
51
|
Homepage = "https://traffic-taffy.github.io/"
|
@@ -70,6 +77,10 @@ ignore = ["E501", "I001", "PLR0913", "ANN101", "ANN204",
|
|
70
77
|
"BLE001",
|
71
78
|
# allow for loop variable overrides
|
72
79
|
"PLW2901",
|
80
|
+
# disable "no blank line before class"
|
81
|
+
"D203",
|
82
|
+
# disable multi-line-summary-second-line
|
83
|
+
"D213",
|
73
84
|
]
|
74
85
|
fixable = ["ALL"] # gulp
|
75
86
|
# select = ["ALL"]
|
@@ -0,0 +1 @@
|
|
1
|
+
# Package version string; keep in sync with pyproject.toml / PKG-INFO.
__VERSION__ = "0.9"
@@ -0,0 +1,21 @@
|
|
1
|
+
"""traffic-taffy algorithm produce comparisons between different datasets."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
from typing import List, TYPE_CHECKING
|
5
|
+
from logging import error
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from traffic_taffy.dissection import Dissection
|
9
|
+
from traffic_taffy.reports import Report
|
10
|
+
|
11
|
+
|
12
|
+
class ComparisonAlgorithm:
    """A base class for all comparison algorithms.

    Subclasses must override compare_dissections(); the base
    implementation only logs a coding error and raises.
    """

    def __init__(self):
        """Construct a ComparisonAlgorithm."""

    def compare_dissections(self, _dissections: List[Dissection]) -> List[Report]:
        """Compare dissections base function just to warn things are not implemented.

        Raises:
            ValueError: always -- this base method must never be called.
        """
        # fixed message: this method is compare_dissections; the old text
        # referred to a non-existent "compare_two_dissections" here
        error("code failure: base class compare_dissections should never be called")
        raise ValueError
@@ -0,0 +1,164 @@
|
|
1
|
+
"""Compares datasets using DataFrame's correlation."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
from typing import List, TYPE_CHECKING
|
5
|
+
import pandas as pd
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
from logging import debug, warning, info
|
9
|
+
|
10
|
+
from traffic_taffy.algorithms.compareseries import ComparisonSeriesAlgorithm
|
11
|
+
from traffic_taffy.reports.correlationreport import CorrelationReport
|
12
|
+
from traffic_taffy.comparison import Comparison, OrganizedReports
|
13
|
+
from traffic_taffy.taffy_config import TaffyConfig, taffy_default
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from pandas import DataFrame
|
17
|
+
from numpy import ndarray
|
18
|
+
|
19
|
+
taffy_default("algorithms.correlation.minimum_correlation", 0.8)
|
20
|
+
taffy_default("algorithms.correlation.correlation_method", "spearman")
|
21
|
+
taffy_default("algorithms.correlation.max_pivot", 1000)
|
22
|
+
|
23
|
+
|
24
|
+
class CompareCorrelation(ComparisonSeriesAlgorithm):
    """Compare series using the pandas correlation."""

    def __init__(
        self,
        timestamps: List[int] | None = None,
        match_string: str | None = None,
        match_value: str | None = None,
        minimum_count: int | None = None,
        make_printable: bool = False,
        match_expression: str | None = None,
    ):
        """Create a CompareCorrelation instance.

        Valid methods: kendall, pearson, spearman, corrcoef

        speed-wise; pearson < spearman < corrcoef < kendall

        accuracy-wise:
        corrcoef: not great (uses np.corrcoef)
        pearson: better but, not good
        spearman: best
        kendall: best
        """
        # all filter arguments are stored by the parent class
        super().__init__(
            timestamps,
            match_string,
            match_value,
            minimum_count,
            make_printable,
            match_expression,
        )
        # the actual method is resolved from configuration in compare_series()
        self.method = None

    def compare_series(
        self, df: DataFrame, indexes: ndarray | None = None
    ) -> List[CorrelationReport]:
        """Compare a bunch of series using correlation.

        This tries to do a comparison in a faster path if the number
        of keys are reasonable (for if not a pivot will consume all
        available memory)
        """

        # pull tunables from the global configuration singleton
        config = TaffyConfig()
        minimum_correlation = float(
            config.get_dotnest("algorithms.correlation.minimum_correlation")
        )
        # stored on self so compare_two_series() (slow path) can use it
        self.minimum_correlation = minimum_correlation

        max_pivot = int(config.get_dotnest("algorithms.correlation.max_pivot"))
        method = config.get_dotnest("algorithms.correlation.correlation_method")
        self.method = method

        # NOTE(review): the indexes parameter is immediately overwritten here,
        # so any caller-supplied value is ignored
        indexes = df["index"].unique()
        num_indexes = len(indexes)
        if num_indexes > max_pivot:
            # we assume this is arbitrarily too large
            # use the slower parent version instead
            warning(
                f"too many indexes ({num_indexes} > {max_pivot}) == using slower routine to conserve memory"
            )
            return super().compare_series(df, indexes)

        info(f"Studying correlation of {num_indexes} indexes")

        # drop columns not needed for the pivot to save memory
        for key in ["subkey", "index", "filename"]:
            del df[key]
        # wide format: one column per key, one row per time bin
        df = df.pivot_table(
            columns=["key"], index=["time"], values="count", fill_value=0
        )

        # indexes have changed
        indexes = df.columns.to_list()

        # use pandas internal kendall
        # TODO(hardaker): np.corrcoef is multi-core but is pearsons
        # TODO(hardaker): scipy.stat.kendalltau is kendall,
        # but can only do one at a time

        # TODO(hardaker): df.corr() returns different numbers here
        # than inside compare_two_series!!

        reports: OrganizedReports = {}

        if method == "corrcoef":
            # NOTE(review): this branch computes values but never records
            # them (the reporting lines are commented out), and it returns
            # the bare dict rather than [Comparison(...)] like the other
            # paths -- looks unfinished; confirm before relying on it.
            # NOTE(review): results[numx][numy] does not account for the
            # numx+1 slice offset of the inner enumerate -- presumably it
            # should be results[numx][numx + 1 + numy]; verify.
            np_array = df.to_numpy()
            results = np.corrcoef(np_array)
            for numx, column_left in enumerate(indexes):
                for numy, column_right in enumerate(indexes[numx + 1 :]):
                    value = results[numx][numy]
                    # if value > minimum_correlation:
                    # print(
                    # f"{column_left:<30} similar to {column_right:<30}: {value}"
                    # )
            return reports

        # default to using the datafram corr method instead
        df.fillna(0, inplace=True)
        results = df.corr(method=method)

        # walk only the upper triangle of the correlation matrix
        for num, column_left in enumerate(indexes):
            for column_right in indexes[num + 1 :]:
                value = results[column_left][column_right]
                if value > minimum_correlation:
                    # print(f"{column_left:<30} similar to {column_right:<30}: {value}")
                    if column_left not in reports:
                        reports[column_left] = {}
                    reports[column_left][column_right] = CorrelationReport(
                        value,
                    )
        return [Comparison(reports, "Correlation Report", "correlation")]

    def compare_two_series(
        self,
        column_left: str,
        series_left: list,
        column_right: str,
        series_right: list,
        reports: OrganizedReports = None,
    ) -> dict:
        """Compare two series using the dataframe correlation algorithms."""
        debug(f"correlation comparing {column_left} and {column_right}")
        # align the two series on their shared time index
        both = pd.concat([series_left, series_right], axis=1)
        both.fillna(0, inplace=True)

        # Note actually faster -- about the same as df.corr
        # import scipy
        # results = scipy.stats.kendalltau(both['left'], both['right'])
        # value = results.statistic

        # the off-diagonal cell is the left/right correlation
        results = both.corr(method=self.method)
        value = results["left"][1]
        debug(f"{column_left:<30} similar to {column_right:<30}: {value}")

        if value > self.minimum_correlation:
            # print(f"{column_left:<30} similar to {column_right:<30}: {value}")

            return CorrelationReport(value)

        # below the configured threshold: no report (implicit None)
        return
@@ -0,0 +1,210 @@
|
|
1
|
+
"""Compares datasets using DataFrame's correlation."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
from typing import List, TYPE_CHECKING
|
5
|
+
import pandas as pd
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
from logging import debug, warning, info
|
9
|
+
|
10
|
+
from traffic_taffy.algorithms.compareseries import ComparisonSeriesAlgorithm
|
11
|
+
from traffic_taffy.reports.correlationchangereport import CorrelationChangeReport
|
12
|
+
from traffic_taffy.comparison import Comparison, OrganizedReports
|
13
|
+
from traffic_taffy.taffy_config import TaffyConfig, taffy_default
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from pandas import DataFrame
|
17
|
+
from numpy import ndarray
|
18
|
+
|
19
|
+
taffy_default("algorithms.correlationchanges.minimum_change", 0.5)
|
20
|
+
taffy_default("algorithms.correlationchanges.correlation_method", "spearman")
|
21
|
+
taffy_default("algorithms.correlationchanges.comparison_width", 15)
|
22
|
+
taffy_default("algorithms.correlationchanges.slide_length", None)
|
23
|
+
|
24
|
+
|
25
|
+
class CompareCorrelationChanges(ComparisonSeriesAlgorithm):
    """Compare series using the pandas correlation."""

    # fallback pivot-size cap; the sibling CompareCorrelation class reads
    # this from configuration ("algorithms.correlation.max_pivot") instead
    MAX_PIVOT = 1000

    def __init__(
        self,
        timestamps: List[int] | None = None,
        match_string: str | None = None,
        match_value: str | None = None,
        minimum_count: int | None = None,
        make_printable: bool = False,
        match_expression: str | None = None,
    ):
        """Create a CompareCorrelationChanges instance.

        Valid methods: kendall, pearson, spearman, corrcoef

        speed-wise; pearson < spearman < corrcoef < kendall

        accuracy-wise:
        corrcoef: not great (uses np.corrcoef)
        pearson: better but, not good
        spearman: best
        kendall: best
        """
        # all filter arguments are stored by the parent class
        super().__init__(
            timestamps,
            match_string,
            match_value,
            minimum_count,
            make_printable,
            match_expression,
        )
        # resolved from configuration in compare_series()
        self.method = None

    def compare_series(
        self, df: DataFrame, indexes: ndarray | None = None
    ) -> List[CorrelationChangeReport]:
        """Compare a bunch of series looking for changes in correlation.

        This tries to do a comparison in a faster path if the number
        of keys are reasonable (for if not a pivot will consume all
        available memory)
        """

        self.sort_by = "delta_correlation"

        config = TaffyConfig()
        # NOTE(review): the inline fallback (0.3) disagrees with the
        # registered taffy_default of 0.5 for this key -- confirm which
        # value is intended
        minimum_change = float(
            config.get_dotnest("algorithms.correlationchanges.minimum_change", 0.3)
        )
        self.minimum_change = minimum_change

        method = config.get_dotnest("algorithms.correlationchanges.correlation_method")
        self.method = method

        comparison_width = config.get_dotnest(
            "algorithms.correlationchanges.comparison_width"
        )
        self.comparison_width = comparison_width

        # slide_length defaults to the comparison width (no window overlap)
        slide_length = config.get_dotnest("algorithms.correlationchanges.slide_length")
        if not slide_length:
            slide_length = comparison_width
        self.slide_length = slide_length

        # NOTE(review): caller-supplied indexes is overwritten here
        indexes = df["index"].unique()
        num_indexes = len(indexes)
        info(
            f"starting correlation changes comparison: num_indexes={num_indexes}, min_change={self.minimum_change}"
        )

        # TODO(hardaker): use a full sweeping comparison for faster correlations
        # now we just revert to the slower non-pivot method for proof of concept
        return super().compare_series(df, indexes)

        # NOTE(review): everything below this point is unreachable because
        # of the unconditional return above -- retained per the TODO as the
        # future fast (pivot) path
        if num_indexes > self.MAX_PIVOT:
            # we assume this is arbitrarily too large
            # use the slower parent version instead
            warning(
                f"too many indexes ({num_indexes} > {self.MAX_PIVOT}) == using slower routine to conserve memory"
            )
            return super().compare_series(df, indexes)

        for key in ["subkey", "index", "filename"]:
            del df[key]
        df = df.pivot_table(
            columns=["key"], index=["time"], values="count", fill_value=0
        )

        # indexes have changed
        indexes = df.columns.to_list()

        # use pandas internal kendall
        # TODO(hardaker): np.corrcoef is multi-core but is pearsons
        # TODO(hardaker): scipy.stat.kendalltau is kendall,
        # but can only do one at a time

        # TODO(hardaker): df.corr() returns different numbers here
        # than inside compare_two_series!!

        reports: OrganizedReports = {}

        if method == "corrcoef":
            np_array = df.to_numpy()
            results = np.corrcoef(np_array)
            for numx, column_left in enumerate(indexes):
                for numy, column_right in enumerate(indexes[numx + 1 :]):
                    value = results[numx][numy]
                    # if value > minimum_value:
                    # print(
                    # f"{column_left:<30} similar to {column_right:<30}: {value}"
                    # )
            return reports

        # default to using the datafram corr method instead
        results = df.corr(method=method)

        # TODO(hardaker): this doesn't actually do anything
        # need to break correlation into pieces and run multiple passes

        for num, column_left in enumerate(indexes):
            for column_right in indexes[num + 1 :]:
                value = results[column_left][column_right]
                if value > self.minimum_change:
                    # print(f"{column_left:<30} similar to {column_right:<30}: {value}")
                    if column_left not in reports:
                        reports[column_left] = {}
                    reports[column_left][column_right] = CorrelationChangeReport(
                        value,
                    )

        return [Comparison(reports, "Correlation Report", "delta_correlation")]

    def compare_two_series(
        self,
        column_left: str,
        series_left: list,
        column_right: str,
        series_right: list,
    ) -> CorrelationChangeReport | None:
        """Compare two series using the dataframe correlation algorithms."""
        debug(f"correlation comparing {column_left} and {column_right}")
        # align the two series on their shared time index
        both = pd.concat([series_left, series_right], axis=1)
        both.fillna(0, inplace=True)

        # Note actually faster -- about the same as df.corr
        # import scipy
        # results = scipy.stats.kendalltau(both['left'], both['right'])
        # value = results.statistic

        # two adjacent sliding windows: [start, middle) vs [middle, end)
        start_index: int = 0
        middle_index: int = self.comparison_width
        end_index: int = 2 * self.comparison_width

        data_length = len(both)

        while end_index < data_length:
            # correlation within each half-window
            left_correlation = both[start_index:middle_index].corr(self.method)["left"][
                "right"
            ]
            right_correlation = both[middle_index:end_index].corr(self.method)["left"][
                "right"
            ]
            delta_correlation = right_correlation - left_correlation

            # well this is ugly:
            # assumes the index is a DatetimeIndex -- TODO confirm
            timestamp = (
                both[middle_index : middle_index + 1]
                .index.to_pydatetime()[0]
                .timestamp()
            )

            debug(f" {right_correlation} - {left_correlation} = {delta_correlation}")
            # report only the FIRST window pair exceeding the threshold
            if abs(delta_correlation) >= self.minimum_change:
                return CorrelationChangeReport(
                    left_correlation, right_correlation, delta_correlation, timestamp
                )

            start_index += self.slide_length
            middle_index += self.slide_length
            end_index += self.slide_length

        # if we get here there are no change points found
        return
@@ -0,0 +1,117 @@
|
|
1
|
+
"""Compares datasets in time-series rather than by series."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
from typing import List, TYPE_CHECKING
|
5
|
+
from traffic_taffy.algorithms import ComparisonAlgorithm
|
6
|
+
from traffic_taffy.graphdata import PcapGraphData
|
7
|
+
from traffic_taffy.comparison import Comparison, OrganizedReports
|
8
|
+
|
9
|
+
from logging import error
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from traffic_taffy.dissection import Dissection
|
13
|
+
from pandas import DataFrame
|
14
|
+
from numpy import ndarray
|
15
|
+
|
16
|
+
|
17
|
+
class ComparisonSeriesAlgorithm(ComparisonAlgorithm):
    """A base class for algorithms that compare left/right series."""

    def __init__(
        self,
        timestamps: List[int] | None = None,
        match_string: str | None = None,
        match_value: str | None = None,
        minimum_count: int | None = None,
        make_printable: bool = False,
        match_expression: str | None = None,
    ):
        """Create a ComparisonAlgorithm.

        Args:
            timestamps: restrict the comparison to these time bins.
            match_string: only keep keys matching this string.
            match_value: only keep values matching this string.
            minimum_count: drop entries with fewer than this many counts.
            make_printable: convert values to printable form while filtering.
            match_expression: extra filter expression for Dissection.filter().
        """
        self.timestamps = timestamps
        self.match_string = match_string
        self.match_value = match_value
        self.minimum_count = minimum_count
        self.make_printable = make_printable
        # BUG FIX: a stray trailing comma previously stored a one-element
        # tuple (match_expression,); store the value itself like every
        # other attribute, since it is passed straight to filter() below.
        self.match_expression = match_expression
        self.sort_by = "correlation"

    def compare_two_series(
        self,
        _column_left: str,
        _series_left: list,
        _column_right: str,
        _series_right: list,
    ) -> dict:
        """Error catching base class function for comparing two columnar series.

        Raises:
            ValueError: always -- subclasses must implement this.
        """
        error("code failure: base class compare_two_series should never be called")
        raise ValueError

    def compare_dissections(self, dissections: List[Dissection]) -> List[Comparison]:
        """Compare all the column series."""
        # hack to figure out if there is at least two instances of a generator
        # without actually extracting them all
        # (since it could be memory expensive)

        # merge all dissections together into one
        # TODO(hardaker): ideally this should be a parameter
        # forced upward into dissectmany
        dissection = next(dissections)
        for to_be_merged in dissections:
            dissection.merge(to_be_merged)

        # filter downward
        dissection = dissection.filter(
            self.timestamps,
            self.match_string,
            self.match_value,
            self.minimum_count,
            self.make_printable,
            self.match_expression,
        )

        data = PcapGraphData()
        data.dissections = [dissection]
        # data.normalize_bins() ?
        df = data.get_dataframe()

        return self.compare_series(df)

    def compare_series(
        self, df: DataFrame, indexes: ndarray | None = None
    ) -> List[Comparison]:
        """Compares the series found in a dataframe, two at a time."""

        reports: OrganizedReports = {}

        if indexes is None:
            indexes = df["index"].unique()

        for num, column_left in enumerate(indexes):
            # extract the left series as a time-indexed count column
            series_left = df[df["index"] == column_left]
            series_left = series_left.set_index("time")
            series_left = series_left["count"]
            series_left.name = "left"

            # TODO(hardaker): n^2 is bad
            for column_right in indexes[num + 1 :]:
                if column_left == column_right:
                    continue

                series_right = df[df["index"] == column_right]
                series_right = series_right.set_index("time")
                series_right = series_right["count"]
                series_right.name = "right"

                report = self.compare_two_series(
                    column_left, series_left, column_right, series_right
                )
                if column_left not in reports:
                    reports[column_left] = {}

                if isinstance(report, list):
                    # TODO(hardaker): we don't actually handle arrays yet
                    # BUG FIX: .extend() previously ran on a key that was
                    # never initialized and raised KeyError; create the
                    # list on first use instead.
                    reports[column_left].setdefault(column_right, []).extend(report)
                elif report:
                    reports[column_left][column_right] = report

        return [Comparison(reports, "Correlation Report", self.sort_by)]