traffic-taffy 0.3.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. traffic_taffy/cache_info.py +0 -6
  2. traffic_taffy/compare.py +154 -250
  3. traffic_taffy/comparison.py +26 -0
  4. traffic_taffy/dissection.py +383 -0
  5. traffic_taffy/dissectmany.py +20 -18
  6. traffic_taffy/dissector.py +128 -476
  7. traffic_taffy/dissector_engine/__init__.py +35 -0
  8. traffic_taffy/dissector_engine/dpkt.py +98 -0
  9. traffic_taffy/dissector_engine/scapy.py +98 -0
  10. traffic_taffy/graph.py +23 -90
  11. traffic_taffy/graphdata.py +35 -20
  12. traffic_taffy/output/__init__.py +118 -0
  13. traffic_taffy/output/console.py +72 -0
  14. traffic_taffy/output/fsdb.py +50 -0
  15. traffic_taffy/output/memory.py +51 -0
  16. traffic_taffy/pcap_splitter.py +17 -36
  17. traffic_taffy/tools/cache_info.py +65 -0
  18. traffic_taffy/tools/compare.py +110 -0
  19. traffic_taffy/tools/dissect.py +77 -0
  20. traffic_taffy/tools/explore.py +686 -0
  21. traffic_taffy/tools/graph.py +85 -0
  22. {traffic_taffy-0.3.6.dist-info → traffic_taffy-0.4.1.dist-info}/METADATA +1 -1
  23. traffic_taffy-0.4.1.dist-info/RECORD +29 -0
  24. traffic_taffy-0.4.1.dist-info/entry_points.txt +6 -0
  25. pcap_compare/cache_info.py +0 -46
  26. pcap_compare/compare.py +0 -288
  27. pcap_compare/dissectmany.py +0 -21
  28. pcap_compare/dissector.py +0 -512
  29. pcap_compare/dissectorresults.py +0 -21
  30. pcap_compare/graph.py +0 -210
  31. traffic_taffy/explore.py +0 -221
  32. traffic_taffy-0.3.6.dist-info/RECORD +0 -22
  33. traffic_taffy-0.3.6.dist-info/entry_points.txt +0 -5
  34. {pcap_compare → traffic_taffy/tools}/__init__.py +0 -0
  35. {traffic_taffy-0.3.6.dist-info → traffic_taffy-0.4.1.dist-info}/WHEEL +0 -0
  36. {traffic_taffy-0.3.6.dist-info → traffic_taffy-0.4.1.dist-info}/top_level.txt +0 -0
traffic_taffy/dissection.py
@@ -0,0 +1,383 @@
+ import os
+ from collections import defaultdict, Counter
+ from typing import Any
+ from logging import debug, info, error
+ from enum import Enum
+ import msgpack
+ import ipaddress
+ from typing import List
+ from copy import deepcopy
+
+
+ class PCAPDissectorLevel(Enum):
+     COUNT_ONLY = 1
+     THROUGH_IP = 2
+     DETAILED = 10
+
+
+ class Dissection:
+     DISSECTION_KEY: str = "PCAP_DISSECTION_VERSION"
+     DISSECTION_VERSION: int = 7
+
+     TOTAL_COUNT: str = "__TOTAL__"
+     TOTAL_SUBKEY: str = "packet"
+     WIDTH_SUBKEY: str = "__WIDTH__"
+     NEW_RIGHT_SUBKEY: str = "__NEW_VALUES__"
+
+     def __init__(
+         self,
+         pcap_file: str,
+         pcap_filter: str | None = None,
+         maximum_count: int = 0,
+         bin_size: int = 0,
+         dissector_level: PCAPDissectorLevel = PCAPDissectorLevel.DETAILED,
+         cache_file_suffix: str = "taffy",
+         ignore_list: list = [],
+         *args,
+         **kwargs,
+     ):
+         self.pcap_file = pcap_file
+         self.bin_size = bin_size
+         self.cache_file_suffix = cache_file_suffix
+         self._data = defaultdict(Dissection.subdict_producer)
+         self._timestamp = 0
+         self.dissector_level = dissector_level
+         self.maximum_count = maximum_count
+         self.pcap_filter = pcap_filter
+         self.ignore_list = ignore_list
+
+         self.parameters = [
+             "pcap_file",
+             "bin_size",
+             "dissector_level",
+             "pcap_filter",
+             "maximum_count",
+             "ignore_list",
+         ]
+         self.settable_from_cache = ["bin_size", "dissector_level", "maximum_count"]
+
+     def clone(self):
+         newd = Dissection(
+             self.pcap_file,
+             self.pcap_filter,
+             self.maximum_count,
+             self.bin_size,
+             self.dissector_level,
+             self.cache_file_suffix,
+             deepcopy(self.ignore_list),
+         )
+         newd.data = deepcopy(self.data)
+         newd.timestamp = self.timestamp
+         return newd
+
+     @property
+     def timestamp(self):
+         return self._timestamp
+
+     @timestamp.setter
+     def timestamp(self, newval):
+         self._timestamp = newval
+
+     @property
+     def data(self):
+         return self._data
+
+     @data.setter
+     def data(self, newval):
+         self._data = newval
+
+     @property
+     def pcap_file(self):
+         return self._pcap_file
+
+     @pcap_file.setter
+     def pcap_file(self, newval):
+         self._pcap_file = newval
+
+     def incr(self, key: str, value: Any, count: int = 1):
+         "increase one field within the counter"
+         # always save a total count at the zero bin
+         # note: there should be no recorded tcpdump files from 1970 Jan 01 :-)
+         self.data[0][key][value] += count
+         if self.timestamp:
+             if self.timestamp not in self.data:
+                 self.data[self.timestamp] = defaultdict(Counter)
+             self.data[self.timestamp][key][value] += count
+
+     def calculate_metadata(self) -> None:
+         "Calculates things like the number of value entries within each key/subkey"
+         # TODO: do we do this with or without key and value matches?
+         for timestamp in self.data.keys():
+             for key in self.data[timestamp]:
+                 if self.WIDTH_SUBKEY in self.data[timestamp][key]:
+                     # make sure to avoid counting itself
+                     del self.data[timestamp][key][self.WIDTH_SUBKEY]
+                 self.data[timestamp][key][self.WIDTH_SUBKEY] = len(
+                     self.data[timestamp][key]
+                 )
+
+                 if self.NEW_RIGHT_SUBKEY in self.data[timestamp][key]:
+                     # don't count the NEW subkey either
+                     self.data[timestamp][key][self.WIDTH_SUBKEY] -= 1
+
+     def merge(self, other_dissection) -> None:
+         "merges counters in two dissections into self -- note destructive to self"
+         for timestamp in other_dissection.data:
+             for key in other_dissection.data[timestamp]:
+                 for subkey in other_dissection.data[timestamp][key]:
+                     # TODO: this is horribly inefficient
+                     if timestamp not in self.data:
+                         self.data[timestamp] = defaultdict(Counter)
+                     elif key not in self.data[timestamp]:
+                         self.data[timestamp][key] = Counter()
+                     elif isinstance(self.data[timestamp][key], dict):
+                         self.data[timestamp][key][subkey] = 0
+                     self.data[timestamp][key][subkey] += other_dissection.data[
+                         timestamp
+                     ][key][subkey]
+
+     @staticmethod
+     def subdict_producer():
+         return defaultdict(Counter)
+
+     #
+     # Loading / Saving
+     #
+
+     def load_from_cache(self, force: bool = False) -> dict | None:
+         if not self.pcap_file or not isinstance(self.pcap_file, str):
+             return None
+         if not os.path.exists(self.pcap_file + self.cache_file_suffix):
+             return None
+
+         cached_file = self.pcap_file + self.cache_file_suffix
+         cached_contents = self.load_saved(cached_file, dont_overwrite=True)
+
+         ok_to_load = True
+
+         if cached_contents[self.DISSECTION_KEY] != self.DISSECTION_VERSION:
+             debug(
+                 f"dissection cache version ({cached_contents[self.DISSECTION_KEY]}) differs from code version {self.DISSECTION_VERSION}"
+             )
+             ok_to_load = False
+
+         # a zero really is a 1 since bin(0) still does int(timestamp)
+         if (
+             cached_contents["parameters"]["bin_size"] == 0
+             or cached_contents["parameters"]["bin_size"] is None
+         ):
+             cached_contents["parameters"]["bin_size"] = 1
+
+         for parameter in self.parameters:
+             specified = getattr(self, parameter)
+             cached = cached_contents["parameters"][parameter]
+
+             if not specified and parameter in self.settable_from_cache:
+                 # inherit from the cache
+                 setattr(self, parameter, cached)
+                 continue
+
+             if specified and specified != cached:
+                 # special checks for certain types of parameters:
+
+                 if parameter == "dissector_level":
+                     debug("------------ here 1")
+                 if parameter == "dissector_level" and specified <= cached:
+                     debug(f"here with dissector_level {specified} and {cached}")
+                     # loading a more detailed cache is ok
+                     continue
+
+                 if parameter == "pcap_file" and os.path.basename(
+                     specified
+                 ) == os.path.basename(cached):
+                     # as long as the basename is ok, we'll assume it's a different path
+                     # TODO: only store basename?
+                     continue
+
+                 debug(
+                     f"parameter {parameter} doesn't match: specified={specified} != cached={cached}"
+                 )
+                 ok_to_load = False
+
+         if ok_to_load:
+             info(f"loading cached pcap contents from {cached_file}")
+             self.load_saved_contents(cached_contents)
+             return self
+
+         if force:
+             info("forced continuing without loading the cache")
+             return None
+
+         error(f"Failed to load cached data for {self.pcap_file} due to differences")
+         error("refusing to continue -- remove the cache to recreate it")
+         raise ValueError(
+             "INCOMPATIBLE CACHE: remove the cache or don't use it to continue"
+         )
+
+     def save_to_cache(self, where: str | None = None) -> None:
+         if not where and self.pcap_file and isinstance(self.pcap_file, str):
+             where = self.pcap_file + self.cache_file_suffix
+         if where:
+             self.save(where)
+
+     def save(self, where: str) -> None:
+         "Saves a generated dissection to a msgpack file"
+
+         # wrap the report in a version header
+         versioned_cache = {
+             self.DISSECTION_KEY: self.DISSECTION_VERSION,
+             "file": self.pcap_file,
+             "parameters": {},
+             "dissection": self.data,
+         }
+
+         for parameter in self.parameters:
+             versioned_cache["parameters"][parameter] = getattr(self, parameter)
+             # TODO: fix this hack
+
+             # basically, bin_size of 0 is 1... but it may be faster
+             # to leave it at zero to avoid the bin_size math of 1,
+             # which is actually a math noop that will still consume
+             # cycles. We save it as 1 though since the math is past
+             # us and a 1 value is more informative to the user.
+             if parameter == "bin_size" and self.bin_size == 0:
+                 versioned_cache["parameters"][parameter] = 1
+
+             if parameter == "dissector_level" and isinstance(
+                 versioned_cache["parameters"][parameter], PCAPDissectorLevel
+             ):
+                 versioned_cache["parameters"][parameter] = versioned_cache[
+                     "parameters"
+                 ][parameter].value
+
+         # msgpack can't store sets
+         versioned_cache["parameters"]["ignore_list"] = list(
+             versioned_cache["parameters"]["ignore_list"]
+         )
+
+         # save it
+         info(f"caching PCAP data to '{where}'")
+         msgpack.dump(versioned_cache, open(where, "wb"))
+
+     def load_saved_contents(self, versioned_cache):
+         # set the local parameters from the cache
+         for parameter in self.parameters:
+             setattr(self, parameter, versioned_cache["parameters"][parameter])
+
+         # load the data
+         self.data = versioned_cache["dissection"]
+
+     def load_saved(self, where: str, dont_overwrite: bool = False) -> dict:
+         "Loads a previous saved report from a file instead of re-parsing pcaps"
+         contents = msgpack.load(open(where, "rb"), strict_map_key=False)
+
+         # convert the ignore list to a set (msgpack doesn't do sets)
+         contents["parameters"]["ignore_list"] = set(
+             contents["parameters"]["ignore_list"]
+         )
+
+         # check that the version header matches something we understand
+         if contents[self.DISSECTION_KEY] != self.DISSECTION_VERSION:
+             raise ValueError(
+                 "improper saved dissection version: report version = "
+                 + str(contents[self.DISSECTION_KEY])
+                 + ", our version: "
+                 + str(self.DISSECTION_VERSION)
+             )
+
+         if not dont_overwrite:
+             self.load_saved_contents(contents)
+
+         return contents
+
+     def find_data(
+         self,
+         timestamps: List[int] | None = None,
+         match_string: str | None = None,
+         match_value: str | None = None,
+         minimum_count: int | None = None,
+         make_printable: bool = False,
+     ):
+         data = self.data
+
+         if not timestamps:
+             timestamps = data.keys()
+
+         # find timestamps/key values with at least one item above count
+         # TODO: we should really use pandas for this
+         usable = defaultdict(set)
+         for timestamp in timestamps:
+             for key in data[timestamp]:
+                 # if they requested a match string
+                 if match_string and match_string not in key:
+                     continue
+
+                 # ensure at least one of the count values for the
+                 # stream gets above minimum_count
+                 for subkey, count in data[timestamp][key].items():
+                     if (
+                         not minimum_count
+                         or minimum_count
+                         and abs(count) > minimum_count
+                     ):
+                         usable[key].add(subkey)
+
+         # TODO: move the timestamp inside the other fors for faster
+         # processing of skipped key/subkeys
+         for timestamp in timestamps:
+             for key in sorted(data[timestamp]):
+                 if key not in usable:
+                     continue
+
+                 for subkey, count in sorted(
+                     data[timestamp][key].items(), key=lambda x: x[1], reverse=True
+                 ):
+                     # check that this subkey can be usable at all
+                     if subkey not in usable[key]:
+                         continue
+
+                     if make_printable:
+                         subkey = Dissection.make_printable(key, subkey)
+                         count = Dissection.make_printable(None, count)
+
+                     if match_value and match_value not in subkey:
+                         continue
+
+                     yield (timestamp, key, subkey, count)
+
+     @staticmethod
+     def make_printable(value_type: str, value: Any) -> str:
+         try:
+             if isinstance(value, bytes):
+                 if value_type in Dissection.display_transformers:
+                     value = str(Dissection.display_transformers[value_type](value))
+                 else:
+                     value = "0x" + value.hex()
+             else:
+                 value = str(value)
+         except Exception:
+             if isinstance(value, bytes):
+                 value = "0x" + value.hex()
+             else:
+                 value = "[unprintable]"
+         if len(value) > 40:
+             value = value[0:40] + "..."  # truncate to reasonable
+         return value
+
+     def print_mac_address(value):
+         "Converts bytes to ethernet mac style address"
+
+         # TODO: certainly inefficient
+         def two_hex(value):
+             return f"{value:02x}"
+
+         return ":".join(map(two_hex, value))
+
+     display_transformers = {
+         "Ethernet.IP.src": ipaddress.ip_address,
+         "Ethernet.IP.dst": ipaddress.ip_address,
+         "Ethernet.IP6.src": ipaddress.ip_address,
+         "Ethernet.IP6.dst": ipaddress.ip_address,
+         "Ethernet.src": print_mac_address,
+         "Ethernet.dst": print_mac_address,
+     }
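
The new Dissection class keeps its counters in a nested timestamp -> key -> Counter structure, with bin 0 reserved for whole-capture running totals. A minimal usage sketch of that API follows; the capture name, timestamp, and address bytes are hypothetical, for illustration only:

    from traffic_taffy.dissection import Dissection

    # hypothetical file name; Dissection only records it, the dissector engines do the reading
    dissection = Dissection("example.pcap")

    dissection.timestamp = 1700000000  # current time bin; bin 0 always keeps the totals
    dissection.incr("Ethernet.IP.dst", b"\x0a\x00\x00\x01")  # one packet to 10.0.0.1
    dissection.incr(Dissection.TOTAL_COUNT, Dissection.TOTAL_SUBKEY)

    dissection.calculate_metadata()  # adds a __WIDTH__ value count to each key

    # make_printable=True routes bytes through display_transformers, so the
    # IP.dst bytes above come back as "10.0.0.1"
    for timestamp, key, subkey, count in dissection.find_data(make_printable=True):
        print(timestamp, key, subkey, count)
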
traffic_taffy/dissectmany.py
@@ -1,4 +1,4 @@
- from traffic_taffy.dissector import PCAPDissector, pcap_data_merge
+ from traffic_taffy.dissector import PCAPDissector
  from pcap_parallel import PCAPParallel
  from concurrent.futures import ProcessPoolExecutor
  from logging import info
@@ -30,17 +30,19 @@ class PCAPDissectMany:
              **self.kwargs,
          )
          pd.load()
-         return pd.data
+         return pd.dissection

-     def load_pcap(self, pcap_file, split_size=None, maximum_count=0):
+     def load_pcap(
+         self, pcap_file, split_size=None, maximum_count: int = 0, force: bool = False
+     ):
          pd = PCAPDissector(
              pcap_file,
              *self.args,
              **self.kwargs,
          )
-         data = pd.load_from_cache()
-         if data:
-             return {"file": pcap_file, "data": data}
+         dissection = pd.load_from_cache(force=force)
+         if dissection:
+             return dissection

          # TODO: check caching availability here
          info(f"processing {pcap_file}")
@@ -53,26 +55,26 @@ class PCAPDissectMany:
          )
          results = ps.split()

-         data = results.pop(0).result()
+         # the data is coming back in (likely overlapping) chunks, and
+         # we need to merge them together
+         dissection = results.pop(0).result()
+         dissection.pcap_file = pcap_file  # splitting has the wrong name
          for result in results:
-             data = pcap_data_merge(data, result.result())
+             dissection.merge(result.result())

-         PCAPDissector.calculate_metadata(data)
+         dissection.calculate_metadata()

          if self.kwargs.get("cache_results"):
              # create a dissector just to save the cache
              # (we don't call load())
-             pd = PCAPDissector(
-                 pcap_file,
-                 *self.args,
-                 **self.kwargs,
+             dissection.pcap_file = pcap_file
+             dissection.save_to_cache(
+                 pcap_file + "." + self.kwargs.get("cache_file_suffix", "taffy")
              )
-             pd.data = data
-             pd.save(pcap_file + ".pkl")

-         return {"file": pcap_file, "data": data}
+         return dissection

      def load_all(self):
          with ProcessPoolExecutor() as executor:
-             results = executor.map(self.load_pcap, self.pcap_files)
-             return results
+             dissections = executor.map(self.load_pcap, self.pcap_files)
+             return dissections
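
With this change load_pcap returns a Dissection (loaded from cache, or merged from the split chunks) rather than a {"file": ..., "data": ...} dict, so callers iterate over dissection objects directly. A sketch of driving the class, assuming the constructor simply stores pcap_files and forwards the remaining keyword arguments to PCAPDissector (its __init__ is not shown in this diff); the file names are placeholders:

    from traffic_taffy.dissectmany import PCAPDissectMany

    # assumed signature: a positional list of files, kwargs passed through
    pdm = PCAPDissectMany(
        ["one.pcap", "two.pcap"],   # placeholder capture files
        cache_results=True,         # read above via self.kwargs.get("cache_results")
        cache_file_suffix="taffy",  # names the saved cache file
    )

    for dissection in pdm.load_all():
        totals = dissection.data[0]  # bin 0 holds the whole-capture totals
        print(dissection.pcap_file, totals["__TOTAL__"]["packet"])
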