traffic-taffy 0.8.1__py3-none-any.whl → 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. traffic_taffy/__init__.py +1 -1
  2. traffic_taffy/algorithms/__init__.py +14 -7
  3. traffic_taffy/algorithms/comparecorrelation.py +164 -0
  4. traffic_taffy/algorithms/comparecorrelationchanges.py +210 -0
  5. traffic_taffy/algorithms/compareseries.py +117 -0
  6. traffic_taffy/algorithms/compareslices.py +116 -0
  7. traffic_taffy/algorithms/statistical.py +9 -9
  8. traffic_taffy/compare.py +149 -159
  9. traffic_taffy/comparison.py +18 -4
  10. traffic_taffy/config.py +133 -0
  11. traffic_taffy/dissection.py +171 -6
  12. traffic_taffy/dissectmany.py +26 -16
  13. traffic_taffy/dissector.py +189 -77
  14. traffic_taffy/dissector_engine/scapy.py +41 -8
  15. traffic_taffy/graph.py +54 -53
  16. traffic_taffy/graphdata.py +13 -2
  17. traffic_taffy/hooks/ip2asn.py +20 -7
  18. traffic_taffy/hooks/labels.py +45 -0
  19. traffic_taffy/hooks/psl.py +21 -3
  20. traffic_taffy/iana/tables.msgpak +0 -0
  21. traffic_taffy/output/__init__.py +8 -48
  22. traffic_taffy/output/console.py +37 -25
  23. traffic_taffy/output/fsdb.py +24 -18
  24. traffic_taffy/reports/__init__.py +5 -0
  25. traffic_taffy/reports/compareslicesreport.py +85 -0
  26. traffic_taffy/reports/correlationchangereport.py +54 -0
  27. traffic_taffy/reports/correlationreport.py +42 -0
  28. traffic_taffy/taffy_config.py +44 -0
  29. traffic_taffy/tests/test_compare_results.py +22 -7
  30. traffic_taffy/tests/test_config.py +149 -0
  31. traffic_taffy/tests/test_global_config.py +33 -0
  32. traffic_taffy/tests/test_normalize.py +1 -0
  33. traffic_taffy/tests/test_pcap_dissector.py +12 -2
  34. traffic_taffy/tests/test_pcap_splitter.py +21 -10
  35. traffic_taffy/tools/cache_info.py +3 -2
  36. traffic_taffy/tools/compare.py +32 -24
  37. traffic_taffy/tools/config.py +83 -0
  38. traffic_taffy/tools/dissect.py +51 -59
  39. traffic_taffy/tools/explore.py +5 -4
  40. traffic_taffy/tools/export.py +28 -17
  41. traffic_taffy/tools/graph.py +25 -27
  42. {traffic_taffy-0.8.1.dist-info → traffic_taffy-0.9.dist-info}/METADATA +4 -1
  43. traffic_taffy-0.9.dist-info/RECORD +56 -0
  44. {traffic_taffy-0.8.1.dist-info → traffic_taffy-0.9.dist-info}/entry_points.txt +1 -0
  45. traffic_taffy/report.py +0 -12
  46. traffic_taffy/tests/test_dpkt_engine.py +0 -15
  47. traffic_taffy-0.8.1.dist-info/RECORD +0 -43
  48. {traffic_taffy-0.8.1.dist-info → traffic_taffy-0.9.dist-info}/WHEEL +0 -0
  49. {traffic_taffy-0.8.1.dist-info → traffic_taffy-0.9.dist-info}/licenses/LICENSE.txt +0 -0
traffic_taffy/__init__.py CHANGED
@@ -1 +1 @@
1
- __VERSION__ = "0.8.1"
1
+ __VERSION__ = "0.9"
@@ -1,14 +1,21 @@
1
- from typing import TYPE_CHECKING
1
+ """traffic-taffy algorithms produce comparisons between different datasets."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List, TYPE_CHECKING
5
+ from logging import error
2
6
 
3
7
  if TYPE_CHECKING:
4
- pass
8
+ from traffic_taffy.dissection import Dissection
9
+ from traffic_taffy.reports import Report
5
10
 
6
11
 
7
12
  class ComparisonAlgorithm:
13
+ """A base class for all comparison algorithms."""
14
+
8
15
  def __init__(self):
9
- pass
16
+ """Construct a ComparisonAlgorithm."""
10
17
 
11
- def compare_dissections(left_side: dict, right_side: dict) -> dict:
12
- raise ValueError(
13
- "code failure: base class compare_dissections should never be called"
14
- )
18
+ def compare_dissections(self, _dissections: List[Dissection]) -> List[Report]:
19
+ """Base compare_dissections function; exists only to warn that subclasses must implement it."""
20
+ error("code failure: base class compare_two_dissections should never be called")
21
+ raise ValueError
@@ -0,0 +1,164 @@
1
+ """Compares datasets using DataFrame's correlation."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List, TYPE_CHECKING
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ from logging import debug, warning, info
9
+
10
+ from traffic_taffy.algorithms.compareseries import ComparisonSeriesAlgorithm
11
+ from traffic_taffy.reports.correlationreport import CorrelationReport
12
+ from traffic_taffy.comparison import Comparison, OrganizedReports
13
+ from traffic_taffy.taffy_config import TaffyConfig, taffy_default
14
+
15
+ if TYPE_CHECKING:
16
+ from pandas import DataFrame
17
+ from numpy import ndarray
18
+
19
+ taffy_default("algorithms.correlation.minimum_correlation", 0.8)
20
+ taffy_default("algorithms.correlation.correlation_method", "spearman")
21
+ taffy_default("algorithms.correlation.max_pivot", 1000)
22
+
23
+
24
+ class CompareCorrelation(ComparisonSeriesAlgorithm):
25
+ """Compare series using the pandas correlation."""
26
+
27
+ def __init__(
28
+ self,
29
+ timestamps: List[int] | None = None,
30
+ match_string: str | None = None,
31
+ match_value: str | None = None,
32
+ minimum_count: int | None = None,
33
+ make_printable: bool = False,
34
+ match_expression: str | None = None,
35
+ ):
36
+ """Create a CompareCorrelation instance.
37
+
38
+ Valid methods: kendall, pearson, spearman, corrcoef
39
+
40
+ speed-wise: pearson < spearman < corrcoef < kendall
41
+
42
+ accuracy-wise:
43
+ corrcoef: not great (uses np.corrcoef)
44
+ pearson: better but, not good
45
+ spearman: best
46
+ kendall: best
47
+ """
48
+ super().__init__(
49
+ timestamps,
50
+ match_string,
51
+ match_value,
52
+ minimum_count,
53
+ make_printable,
54
+ match_expression,
55
+ )
56
+ self.method = None
57
+
58
+ def compare_series(
59
+ self, df: DataFrame, indexes: ndarray | None = None
60
+ ) -> List[CorrelationReport]:
61
+ """Compare a bunch of series using correlation.
62
+
63
+ This tries to do a comparison in a faster path if the number
64
+ of keys is reasonable (if not, a pivot will consume all
65
+ available memory)
66
+ """
67
+
68
+ config = TaffyConfig()
69
+ minimum_correlation = float(
70
+ config.get_dotnest("algorithms.correlation.minimum_correlation")
71
+ )
72
+ self.minimum_correlation = minimum_correlation
73
+
74
+ max_pivot = int(config.get_dotnest("algorithms.correlation.max_pivot"))
75
+ method = config.get_dotnest("algorithms.correlation.correlation_method")
76
+ self.method = method
77
+
78
+ indexes = df["index"].unique()
79
+ num_indexes = len(indexes)
80
+ if num_indexes > max_pivot:
81
+ # we assume this is arbitrarily too large
82
+ # use the slower parent version instead
83
+ warning(
84
+ f"too many indexes ({num_indexes} > {max_pivot}) == using slower routine to conserve memory"
85
+ )
86
+ return super().compare_series(df, indexes)
87
+
88
+ info(f"Studying correlation of {num_indexes} indexes")
89
+
90
+ for key in ["subkey", "index", "filename"]:
91
+ del df[key]
92
+ df = df.pivot_table(
93
+ columns=["key"], index=["time"], values="count", fill_value=0
94
+ )
95
+
96
+ # indexes have changed
97
+ indexes = df.columns.to_list()
98
+
99
+ # use pandas internal kendall
100
+ # TODO(hardaker): np.corrcoef is multi-core but is pearsons
101
+ # TODO(hardaker): scipy.stat.kendalltau is kendall,
102
+ # but can only do one at a time
103
+
104
+ # TODO(hardaker): df.corr() returns different numbers here
105
+ # than inside compare_two_series!!
106
+
107
+ reports: OrganizedReports = {}
108
+
109
+ if method == "corrcoef":
110
+ np_array = df.to_numpy()
111
+ results = np.corrcoef(np_array)
112
+ for numx, column_left in enumerate(indexes):
113
+ for numy, column_right in enumerate(indexes[numx + 1 :]):
114
+ value = results[numx][numy]
115
+ # if value > minimum_correlation:
116
+ # print(
117
+ # f"{column_left:<30} similar to {column_right:<30}: {value}"
118
+ # )
119
+ return reports
120
+
121
+ # default to using the dataframe corr method instead
122
+ df.fillna(0, inplace=True)
123
+ results = df.corr(method=method)
124
+
125
+ for num, column_left in enumerate(indexes):
126
+ for column_right in indexes[num + 1 :]:
127
+ value = results[column_left][column_right]
128
+ if value > minimum_correlation:
129
+ # print(f"{column_left:<30} similar to {column_right:<30}: {value}")
130
+ if column_left not in reports:
131
+ reports[column_left] = {}
132
+ reports[column_left][column_right] = CorrelationReport(
133
+ value,
134
+ )
135
+ return [Comparison(reports, "Correlation Report", "correlation")]
136
+
137
+ def compare_two_series(
138
+ self,
139
+ column_left: str,
140
+ series_left: list,
141
+ column_right: str,
142
+ series_right: list,
143
+ reports: OrganizedReports = None,
144
+ ) -> dict:
145
+ """Compare two series using the dataframe correlation algorithms."""
146
+ debug(f"correlation comparing {column_left} and {column_right}")
147
+ both = pd.concat([series_left, series_right], axis=1)
148
+ both.fillna(0, inplace=True)
149
+
150
+ # Note actually faster -- about the same as df.corr
151
+ # import scipy
152
+ # results = scipy.stats.kendalltau(both['left'], both['right'])
153
+ # value = results.statistic
154
+
155
+ results = both.corr(method=self.method)
156
+ value = results["left"][1]
157
+ debug(f"{column_left:<30} similar to {column_right:<30}: {value}")
158
+
159
+ if value > self.minimum_correlation:
160
+ # print(f"{column_left:<30} similar to {column_right:<30}: {value}")
161
+
162
+ return CorrelationReport(value)
163
+
164
+ return
@@ -0,0 +1,210 @@
1
+ """Compares datasets by looking for changes in DataFrame correlation over time."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List, TYPE_CHECKING
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ from logging import debug, warning, info
9
+
10
+ from traffic_taffy.algorithms.compareseries import ComparisonSeriesAlgorithm
11
+ from traffic_taffy.reports.correlationchangereport import CorrelationChangeReport
12
+ from traffic_taffy.comparison import Comparison, OrganizedReports
13
+ from traffic_taffy.taffy_config import TaffyConfig, taffy_default
14
+
15
+ if TYPE_CHECKING:
16
+ from pandas import DataFrame
17
+ from numpy import ndarray
18
+
19
+ taffy_default("algorithms.correlationchanges.minimum_change", 0.5)
20
+ taffy_default("algorithms.correlationchanges.correlation_method", "spearman")
21
+ taffy_default("algorithms.correlationchanges.comparison_width", 15)
22
+ taffy_default("algorithms.correlationchanges.slide_length", None)
23
+
24
+
25
+ class CompareCorrelationChanges(ComparisonSeriesAlgorithm):
26
+ """Compare series using the pandas correlation."""
27
+
28
+ MAX_PIVOT = 1000
29
+
30
+ def __init__(
31
+ self,
32
+ timestamps: List[int] | None = None,
33
+ match_string: str | None = None,
34
+ match_value: str | None = None,
35
+ minimum_count: int | None = None,
36
+ make_printable: bool = False,
37
+ match_expression: str | None = None,
38
+ ):
39
+ """Create a CompareCorrelationChanges instance.
40
+
41
+ Valid methods: kendall, pearson, spearman, corrcoef
42
+
43
+ speed-wise: pearson < spearman < corrcoef < kendall
44
+
45
+ accuracy-wise:
46
+ corrcoef: not great (uses np.corrcoef)
47
+ pearson: better but, not good
48
+ spearman: best
49
+ kendall: best
50
+ """
51
+ super().__init__(
52
+ timestamps,
53
+ match_string,
54
+ match_value,
55
+ minimum_count,
56
+ make_printable,
57
+ match_expression,
58
+ )
59
+ self.method = None
60
+
61
+ def compare_series(
62
+ self, df: DataFrame, indexes: ndarray | None = None
63
+ ) -> List[CorrelationChangeReport]:
64
+ """Compare a bunch of series looking for changes in correlation.
65
+
66
+ This tries to do a comparison in a faster path if the number
67
+ of keys are reasonable (for if not a pivot will consume all
68
+ available memory)
69
+ """
70
+
71
+ self.sort_by = "delta_correlation"
72
+
73
+ config = TaffyConfig()
74
+ minimum_change = float(
75
+ config.get_dotnest("algorithms.correlationchanges.minimum_change", 0.3)
76
+ )
77
+ self.minimum_change = minimum_change
78
+
79
+ method = config.get_dotnest("algorithms.correlationchanges.correlation_method")
80
+ self.method = method
81
+
82
+ comparison_width = config.get_dotnest(
83
+ "algorithms.correlationchanges.comparison_width"
84
+ )
85
+ self.comparison_width = comparison_width
86
+
87
+ slide_length = config.get_dotnest("algorithms.correlationchanges.slide_length")
88
+ if not slide_length:
89
+ slide_length = comparison_width
90
+ self.slide_length = slide_length
91
+
92
+ indexes = df["index"].unique()
93
+ num_indexes = len(indexes)
94
+ info(
95
+ f"starting correlation changes comparison: num_indexes={num_indexes}, min_change={self.minimum_change}"
96
+ )
97
+
98
+ # TODO(hardaker): use a full sweeping comparison for faster correlations
99
+ # now we just revert to the slower non-pivot method for proof of concept
100
+ return super().compare_series(df, indexes)
101
+
102
+ if num_indexes > self.MAX_PIVOT:
103
+ # we assume this is arbitrarily too large
104
+ # use the slower parent version instead
105
+ warning(
106
+ f"too many indexes ({num_indexes} > {self.MAX_PIVOT}) == using slower routine to conserve memory"
107
+ )
108
+ return super().compare_series(df, indexes)
109
+
110
+ for key in ["subkey", "index", "filename"]:
111
+ del df[key]
112
+ df = df.pivot_table(
113
+ columns=["key"], index=["time"], values="count", fill_value=0
114
+ )
115
+
116
+ # indexes have changed
117
+ indexes = df.columns.to_list()
118
+
119
+ # use pandas internal kendall
120
+ # TODO(hardaker): np.corrcoef is multi-core but is pearsons
121
+ # TODO(hardaker): scipy.stat.kendalltau is kendall,
122
+ # but can only do one at a time
123
+
124
+ # TODO(hardaker): df.corr() returns different numbers here
125
+ # than inside compare_two_series!!
126
+
127
+ reports: OrganizedReports = {}
128
+
129
+ if method == "corrcoef":
130
+ np_array = df.to_numpy()
131
+ results = np.corrcoef(np_array)
132
+ for numx, column_left in enumerate(indexes):
133
+ for numy, column_right in enumerate(indexes[numx + 1 :]):
134
+ value = results[numx][numy]
135
+ # if value > minimum_value:
136
+ # print(
137
+ # f"{column_left:<30} similar to {column_right:<30}: {value}"
138
+ # )
139
+ return reports
140
+
141
+ # default to using the dataframe corr method instead
142
+ results = df.corr(method=method)
143
+
144
+ # TODO(hardaker): this doesn't actually do anything
145
+ # need to break correlation into pieces and run multiple passes
146
+
147
+ for num, column_left in enumerate(indexes):
148
+ for column_right in indexes[num + 1 :]:
149
+ value = results[column_left][column_right]
150
+ if value > self.minimum_change:
151
+ # print(f"{column_left:<30} similar to {column_right:<30}: {value}")
152
+ if column_left not in reports:
153
+ reports[column_left] = {}
154
+ reports[column_left][column_right] = CorrelationChangeReport(
155
+ value,
156
+ )
157
+
158
+ return [Comparison(reports, "Correlation Report", "delta_correlation")]
159
+
160
+ def compare_two_series(
161
+ self,
162
+ column_left: str,
163
+ series_left: list,
164
+ column_right: str,
165
+ series_right: list,
166
+ ) -> CorrelationChangeReport | None:
167
+ """Compare two series using the dataframe correlation algorithms."""
168
+ debug(f"correlation comparing {column_left} and {column_right}")
169
+ both = pd.concat([series_left, series_right], axis=1)
170
+ both.fillna(0, inplace=True)
171
+
172
+ # Note actually faster -- about the same as df.corr
173
+ # import scipy
174
+ # results = scipy.stats.kendalltau(both['left'], both['right'])
175
+ # value = results.statistic
176
+
177
+ start_index: int = 0
178
+ middle_index: int = self.comparison_width
179
+ end_index: int = 2 * self.comparison_width
180
+
181
+ data_length = len(both)
182
+
183
+ while end_index < data_length:
184
+ left_correlation = both[start_index:middle_index].corr(self.method)["left"][
185
+ "right"
186
+ ]
187
+ right_correlation = both[middle_index:end_index].corr(self.method)["left"][
188
+ "right"
189
+ ]
190
+ delta_correlation = right_correlation - left_correlation
191
+
192
+ # well this is ugly:
193
+ timestamp = (
194
+ both[middle_index : middle_index + 1]
195
+ .index.to_pydatetime()[0]
196
+ .timestamp()
197
+ )
198
+
199
+ debug(f" {right_correlation} - {left_correlation} = {delta_correlation}")
200
+ if abs(delta_correlation) >= self.minimum_change:
201
+ return CorrelationChangeReport(
202
+ left_correlation, right_correlation, delta_correlation, timestamp
203
+ )
204
+
205
+ start_index += self.slide_length
206
+ middle_index += self.slide_length
207
+ end_index += self.slide_length
208
+
209
+ # if we get here there are no change points found
210
+ return
@@ -0,0 +1,117 @@
1
+ """Compares datasets series-by-series rather than by time slices."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List, TYPE_CHECKING
5
+ from traffic_taffy.algorithms import ComparisonAlgorithm
6
+ from traffic_taffy.graphdata import PcapGraphData
7
+ from traffic_taffy.comparison import Comparison, OrganizedReports
8
+
9
+ from logging import error
10
+
11
+ if TYPE_CHECKING:
12
+ from traffic_taffy.dissection import Dissection
13
+ from pandas import DataFrame
14
+ from numpy import ndarray
15
+
16
+
17
+ class ComparisonSeriesAlgorithm(ComparisonAlgorithm):
18
+ """A base class for algorithms that compare left/right series."""
19
+
20
+ def __init__(
21
+ self,
22
+ timestamps: List[int] | None = None,
23
+ match_string: str | None = None,
24
+ match_value: str | None = None,
25
+ minimum_count: int | None = None,
26
+ make_printable: bool = False,
27
+ match_expression: str | None = None,
28
+ ):
29
+ """Create a ComparisonAlgorithm."""
30
+ self.timestamps = timestamps
31
+ self.match_string = match_string
32
+ self.match_value = match_value
33
+ self.minimum_count = minimum_count
34
+ self.make_printable = make_printable
35
+ self.match_expression = (match_expression,)
36
+ self.sort_by = "correlation"
37
+
38
+ def compare_two_series(
39
+ self,
40
+ _column_left: str,
41
+ _series_left: list,
42
+ _column_right: str,
43
+ _series_right: list,
44
+ ) -> dict:
45
+ """Error catching base class function for comparing two columnar series."""
46
+ error("code failure: base class compare_two_series should never be called")
47
+ raise ValueError
48
+
49
+ def compare_dissections(self, dissections: List[Dissection]) -> List[Comparison]:
50
+ """Compare all the column series."""
51
+ # hack to figure out if there is at least two instances of a generator
52
+ # without actually extracting them all
53
+ # (since it could be memory expensive)
54
+
55
+ # merge all dissections together into one
56
+ # TODO(hardaker): ideally this should be a parameter
57
+ # forced upward into dissectmany
58
+ dissection = next(dissections)
59
+ for to_be_merged in dissections:
60
+ dissection.merge(to_be_merged)
61
+
62
+ # filter downward
63
+ dissection = dissection.filter(
64
+ self.timestamps,
65
+ self.match_string,
66
+ self.match_value,
67
+ self.minimum_count,
68
+ self.make_printable,
69
+ self.match_expression,
70
+ )
71
+
72
+ data = PcapGraphData()
73
+ data.dissections = [dissection]
74
+ # data.normalize_bins() ?
75
+ df = data.get_dataframe()
76
+
77
+ return self.compare_series(df)
78
+
79
+ def compare_series(
80
+ self, df: DataFrame, indexes: ndarray | None = None
81
+ ) -> List[Comparison]:
82
+ """Compare the series found in a dataframe, two at a time."""
83
+
84
+ reports: OrganizedReports = {}
85
+
86
+ if indexes is None:
87
+ indexes = df["index"].unique()
88
+
89
+ for num, column_left in enumerate(indexes):
90
+ series_left = df[df["index"] == column_left]
91
+ series_left = series_left.set_index("time")
92
+ series_left = series_left["count"]
93
+ series_left.name = "left"
94
+
95
+ # TODO(hardaker): n^2 is bad
96
+ for column_right in indexes[num + 1 :]:
97
+ if column_left == column_right:
98
+ continue
99
+
100
+ series_right = df[df["index"] == column_right]
101
+ series_right = series_right.set_index("time")
102
+ series_right = series_right["count"]
103
+ series_right.name = "right"
104
+
105
+ report = self.compare_two_series(
106
+ column_left, series_left, column_right, series_right
107
+ )
108
+ if column_left not in reports:
109
+ reports[column_left] = {}
110
+
111
+ if isinstance(report, list):
112
+ # TODO(hardaker): we don't actually handle arrays yet
113
+ reports[column_left][column_right].extend(report)
114
+ elif report:
115
+ reports[column_left][column_right] = report
116
+
117
+ return [Comparison(reports, "Correlation Report", self.sort_by)]
@@ -0,0 +1,116 @@
1
+ """Compares datasets in time-slices rather than by series."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List, TYPE_CHECKING
5
+ from traffic_taffy.algorithms import ComparisonAlgorithm
6
+ import itertools
7
+ import datetime as dt
8
+
9
+ from logging import debug, error, exception
10
+
11
+ if TYPE_CHECKING:
12
+ from traffic_taffy.dissection import Dissection
13
+ from traffic_taffy.comparison import Comparison
14
+
15
+
16
+ class ComparisonSlicesAlgorithm(ComparisonAlgorithm):
17
+ """A base class for algorithms that compare left/right slices."""
18
+
19
+ def __init__(
20
+ self,
21
+ timestamps: List[int] | None = None,
22
+ match_string: str | None = None,
23
+ match_value: str | None = None,
24
+ minimum_count: int | None = None,
25
+ make_printable: bool = False,
26
+ match_expression: str | None = None,
27
+ ):
28
+ """Create a ComparisonAlgorithm."""
29
+ self.timestamps = timestamps
30
+ self.match_string = match_string
31
+ self.match_value = match_value
32
+ self.minimum_count = minimum_count
33
+ self.make_printable = make_printable
34
+ self.match_expression = (match_expression,)
35
+
36
+ def compare_two_dissections(
37
+ self, _left_side: Dissection, _right_side: Dissection
38
+ ) -> Comparison:
39
+ """Error-catching base class function for comparing two algorithms."""
40
+ error("code failure: base class compare_two_dissections should never be called")
41
+ raise ValueError
42
+
43
+ def compare_dissections(self, dissections: List[Dissection]) -> List[Comparison]:
44
+ """Compare all the dissections in slices."""
45
+ comparisons = []
46
+ # hack to figure out if there is at least two instances of a generator
47
+ # without actually extracting them all
48
+ # (since it could be memory expensive)
49
+
50
+ reference = next(dissections)
51
+ other = None
52
+ multiple = True
53
+ try:
54
+ other = next(dissections)
55
+ dissections = itertools.chain([other], dissections)
56
+ except Exception:
57
+ exception("failed to create a chain of dissections")
58
+ multiple = False
59
+
60
+ if multiple:
61
+ # multiple file comparison
62
+ for other in dissections:
63
+ # compare the two global summaries
64
+
65
+ comparison = self.compare_two_dissections(
66
+ reference.data[0], other.data[0]
67
+ )
68
+ comparison.title = f"{reference.pcap_file} vs {other.pcap_file}"
69
+
70
+ comparisons.append(comparison)
71
+ else:
72
+ # deal with timestamps within a single file
73
+ reference = reference.data
74
+ timestamps = list(reference.keys())
75
+ if len(timestamps) == 1: # just 0-summary plus a single stamp
76
+ error(
77
+ "the requested pcap data was not long enough to compare against itself"
78
+ )
79
+ errorstr: str = "not large enough pcap file"
80
+ raise ValueError(errorstr)
81
+ debug(
82
+ f"found {len(timestamps)} timestamps from {timestamps[2]} to {timestamps[-1]}"
83
+ )
84
+
85
+ for timestamp in range(
86
+ 2, len(timestamps)
87
+ ): # second real non-zero timestamp to last
88
+ time_left = timestamps[timestamp - 1]
89
+ time_right = timestamps[timestamp]
90
+
91
+ # see if we were asked to only use particular time ranges
92
+ # if self.between_times and (
93
+ # time_left < self.between_times[0]
94
+ # or time_right > self.between_times[1]
95
+ # ):
96
+ # continue
97
+
98
+ debug(f"comparing timestamps {time_left} and {time_right}")
99
+
100
+ comparison = self.compare_two_dissections(
101
+ reference[time_left],
102
+ reference[time_right],
103
+ )
104
+
105
+ title_left = dt.datetime.fromtimestamp(time_left, dt.UTC).strftime(
106
+ "%Y-%m-%d %H:%M:%S"
107
+ )
108
+ title_right = dt.datetime.fromtimestamp(time_right, dt.UTC).strftime(
109
+ "%Y-%m-%d %H:%M:%S"
110
+ )
111
+
112
+ comparison.title = f"time {title_left} vs time {title_right}"
113
+ comparisons.append(comparison)
114
+
115
+ # return our collected results
116
+ return comparisons
@@ -1,14 +1,14 @@
1
- from traffic_taffy.algorithms import ComparisonAlgorithm
1
+ from traffic_taffy.algorithms.compareslices import ComparisonSlicesAlgorithm
2
2
  from traffic_taffy.comparison import Comparison
3
3
  from traffic_taffy.dissection import Dissection
4
- from traffic_taffy.report import Report
4
+ from traffic_taffy.reports.compareslicesreport import CompareSlicesReport
5
5
 
6
6
 
7
- class ComparisonStatistical(ComparisonAlgorithm):
8
- def __init__(self):
9
- super().__init__()
7
+ class ComparisonStatistical(ComparisonSlicesAlgorithm):
8
+ def __init__(self, *args, **kwargs):
9
+ super().__init__(*args, **kwargs)
10
10
 
11
- def compare_dissections(self, left_side: dict, right_side: dict) -> Comparison:
11
+ def compare_two_dissections(self, left_side: dict, right_side: dict) -> Comparison:
12
12
  """Compare two dissections."""
13
13
  report = {}
14
14
 
@@ -46,7 +46,7 @@ class ComparisonStatistical(ComparisonAlgorithm):
46
46
  new_left_count += 1
47
47
 
48
48
  delta_absolute = right_count - left_count
49
- report[key][subkey] = Report(
49
+ report[key][subkey] = CompareSlicesReport(
50
50
  delta_percentage=delta_percentage,
51
51
  delta_absolute=delta_absolute,
52
52
  total=total,
@@ -67,7 +67,7 @@ class ComparisonStatistical(ComparisonAlgorithm):
67
67
  right_percentage = right_side[key][subkey] / right_side_total
68
68
  new_right_count += 1 # this value wasn't in the left
69
69
 
70
- report[key][subkey] = Report(
70
+ report[key][subkey] = CompareSlicesReport(
71
71
  delta_percentage=delta_percentage,
72
72
  delta_absolute=right_count,
73
73
  total=total,
@@ -87,7 +87,7 @@ class ComparisonStatistical(ComparisonAlgorithm):
87
87
  else:
88
88
  left_percent = new_left_count / left_side_total
89
89
 
90
- report[key][Dissection.NEW_RIGHT_SUBKEY] = Report(
90
+ report[key][Dissection.NEW_RIGHT_SUBKEY] = CompareSlicesReport(
91
91
  delta_absolute=new_right_count - new_left_count,
92
92
  total=new_left_count + new_right_count,
93
93
  left_count=new_left_count,