traffic-taffy 0.3.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. traffic_taffy/cache_info.py +0 -6
  2. traffic_taffy/compare.py +154 -250
  3. traffic_taffy/comparison.py +26 -0
  4. traffic_taffy/dissection.py +383 -0
  5. traffic_taffy/dissectmany.py +20 -18
  6. traffic_taffy/dissector.py +128 -476
  7. traffic_taffy/dissector_engine/__init__.py +35 -0
  8. traffic_taffy/dissector_engine/dpkt.py +98 -0
  9. traffic_taffy/dissector_engine/scapy.py +98 -0
  10. traffic_taffy/graph.py +23 -90
  11. traffic_taffy/graphdata.py +35 -20
  12. traffic_taffy/output/__init__.py +118 -0
  13. traffic_taffy/output/console.py +72 -0
  14. traffic_taffy/output/fsdb.py +50 -0
  15. traffic_taffy/output/memory.py +51 -0
  16. traffic_taffy/pcap_splitter.py +17 -36
  17. traffic_taffy/tools/cache_info.py +65 -0
  18. traffic_taffy/tools/compare.py +110 -0
  19. traffic_taffy/tools/dissect.py +77 -0
  20. traffic_taffy/tools/explore.py +686 -0
  21. traffic_taffy/tools/graph.py +85 -0
  22. {traffic_taffy-0.3.6.dist-info → traffic_taffy-0.4.1.dist-info}/METADATA +1 -1
  23. traffic_taffy-0.4.1.dist-info/RECORD +29 -0
  24. traffic_taffy-0.4.1.dist-info/entry_points.txt +6 -0
  25. pcap_compare/cache_info.py +0 -46
  26. pcap_compare/compare.py +0 -288
  27. pcap_compare/dissectmany.py +0 -21
  28. pcap_compare/dissector.py +0 -512
  29. pcap_compare/dissectorresults.py +0 -21
  30. pcap_compare/graph.py +0 -210
  31. traffic_taffy/explore.py +0 -221
  32. traffic_taffy-0.3.6.dist-info/RECORD +0 -22
  33. traffic_taffy-0.3.6.dist-info/entry_points.txt +0 -5
  34. {pcap_compare → traffic_taffy/tools}/__init__.py +0 -0
  35. {traffic_taffy-0.3.6.dist-info → traffic_taffy-0.4.1.dist-info}/WHEEL +0 -0
  36. {traffic_taffy-0.3.6.dist-info → traffic_taffy-0.4.1.dist-info}/top_level.txt +0 -0
traffic_taffy/dissection.py
@@ -0,0 +1,383 @@
+ import os
+ from collections import defaultdict, Counter
+ from typing import Any
+ from logging import debug, info, error
+ from enum import Enum
+ import msgpack
+ import ipaddress
+ from typing import List
+ from copy import deepcopy
+
+
+ class PCAPDissectorLevel(Enum):
+     COUNT_ONLY = 1
+     THROUGH_IP = 2
+     DETAILED = 10
+
+
+ class Dissection:
+     DISSECTION_KEY: str = "PCAP_DISSECTION_VERSION"
+     DISSECTION_VERSION: int = 7
+
+     TOTAL_COUNT: str = "__TOTAL__"
+     TOTAL_SUBKEY: str = "packet"
+     WIDTH_SUBKEY: str = "__WIDTH__"
+     NEW_RIGHT_SUBKEY: str = "__NEW_VALUES__"
+
+     def __init__(
+         self,
+         pcap_file: str,
+         pcap_filter: str | None = None,
+         maximum_count: int = 0,
+         bin_size: int = 0,
+         dissector_level: PCAPDissectorLevel = PCAPDissectorLevel.DETAILED,
+         cache_file_suffix: str = "taffy",
+         ignore_list: list = [],
+         *args,
+         **kwargs,
+     ):
+         self.pcap_file = pcap_file
+         self.bin_size = bin_size
+         self.cache_file_suffix = cache_file_suffix
+         self._data = defaultdict(Dissection.subdict_producer)
+         self._timestamp = 0
+         self.dissector_level = dissector_level
+         self.maximum_count = maximum_count
+         self.pcap_filter = pcap_filter
+         self.ignore_list = ignore_list
+
+         self.parameters = [
+             "pcap_file",
+             "bin_size",
+             "dissector_level",
+             "pcap_filter",
+             "maximum_count",
+             "ignore_list",
+         ]
+         self.settable_from_cache = ["bin_size", "dissector_level", "maximum_count"]
+
+     def clone(self):
+         newd = Dissection(
+             self.pcap_file,
+             self.pcap_filter,
+             self.maximum_count,
+             self.bin_size,
+             self.dissector_level,
+             self.cache_file_suffix,
+             deepcopy(self.ignore_list),
+         )
+         newd.data = deepcopy(self.data)
+         newd.timestamp = self.timestamp
+         return newd
+
+     @property
+     def timestamp(self):
+         return self._timestamp
+
+     @timestamp.setter
+     def timestamp(self, newval):
+         self._timestamp = newval
+
+     @property
+     def data(self):
+         return self._data
+
+     @data.setter
+     def data(self, newval):
+         self._data = newval
+
+     @property
+     def pcap_file(self):
+         return self._pcap_file
+
+     @pcap_file.setter
+     def pcap_file(self, newval):
+         self._pcap_file = newval
+
+     def incr(self, key: str, value: Any, count: int = 1):
+         "increase one field within the counter"
+         # always save a total count at the zero bin
+         # note: there should be no recorded tcpdump files from 1970 Jan 01 :-)
+         self.data[0][key][value] += count
+         if self.timestamp:
+             if self.timestamp not in self.data:
+                 self.data[self.timestamp] = defaultdict(Counter)
+             self.data[self.timestamp][key][value] += count
+
+     def calculate_metadata(self) -> None:
+         "Calculates things like the number of value entries within each key/subkey"
+         # TODO: do we do this with or without key and value matches?
+         for timestamp in self.data.keys():
+             for key in self.data[timestamp]:
+                 if self.WIDTH_SUBKEY in self.data[timestamp][key]:
+                     # make sure to avoid counting itself
+                     del self.data[timestamp][key][self.WIDTH_SUBKEY]
+                 self.data[timestamp][key][self.WIDTH_SUBKEY] = len(
+                     self.data[timestamp][key]
+                 )
+
+                 if self.NEW_RIGHT_SUBKEY in self.data[timestamp][key]:
+                     # don't count the NEW subkey either
+                     self.data[timestamp][key][self.WIDTH_SUBKEY] -= 1
+
+     def merge(self, other_dissection) -> None:
+         "merges counters in two dissections into self -- note destructive to self"
+         for timestamp in other_dissection.data:
+             for key in other_dissection.data[timestamp]:
+                 for subkey in other_dissection.data[timestamp][key]:
+                     # TODO: this is horribly inefficient
+                     if timestamp not in self.data:
+                         self.data[timestamp] = defaultdict(Counter)
+                     elif key not in self.data[timestamp]:
+                         self.data[timestamp][key] = Counter()
+                     elif isinstance(self.data[timestamp][key], dict):
+                         self.data[timestamp][key][subkey] = 0
+                     self.data[timestamp][key][subkey] += other_dissection.data[
+                         timestamp
+                     ][key][subkey]
+
+     @staticmethod
+     def subdict_producer():
+         return defaultdict(Counter)
+
+     #
+     # Loading / Saving
+     #
+
+     def load_from_cache(self, force: bool = False) -> dict | None:
+         if not self.pcap_file or not isinstance(self.pcap_file, str):
+             return None
+         if not os.path.exists(self.pcap_file + self.cache_file_suffix):
+             return None
+
+         cached_file = self.pcap_file + self.cache_file_suffix
+         cached_contents = self.load_saved(cached_file, dont_overwrite=True)
+
+         ok_to_load = True
+
+         if cached_contents[self.DISSECTION_KEY] != self.DISSECTION_VERSION:
+             debug(
+                 f"dissection cache version ({cached_contents[self.DISSECTION_KEY]}) differs from code version {self.DISSECTION_VERSION}"
+             )
+             ok_to_load = False
+
+         # a zero really is a 1 since bin(0) still does int(timestamp)
+         if (
+             cached_contents["parameters"]["bin_size"] == 0
+             or cached_contents["parameters"]["bin_size"] is None
+         ):
+             cached_contents["parameters"]["bin_size"] = 1
+
+         for parameter in self.parameters:
+             specified = getattr(self, parameter)
+             cached = cached_contents["parameters"][parameter]
+
+             if not specified and parameter in self.settable_from_cache:
+                 # inherit from the cache
+                 setattr(self, parameter, cached)
+                 continue
+
+             if specified and specified != cached:
+                 # special checks for certain types of parameters:
+
+                 if parameter == "dissector_level":
+                     debug("------------ here 1")
+                 if parameter == "dissector_level" and specified <= cached:
+                     debug(f"here with dissector_level {specified} and {cached}")
+                     # loading a more detailed cache is ok
+                     continue
+
+                 if parameter == "pcap_file" and os.path.basename(
+                     specified
+                 ) == os.path.basename(cached):
+                     # as long as the basename is ok, we'll assume it's a different path
+                     # TODO: only store basename?
+                     continue
+
+                 debug(
+                     f"parameter {parameter} doesn't match: specified={specified} != cached={cached}"
+                 )
+                 ok_to_load = False
+
+         if ok_to_load:
+             info(f"loading cached pcap contents from {cached_file}")
+             self.load_saved_contents(cached_contents)
+             return self
+
+         if force:
+             info("forced continuing without loading the cache")
+             return None
+
+         error(f"Failed to load cached data for {self.pcap_file} due to differences")
+         error("refusing to continue -- remove the cache to recreate it")
+         raise ValueError(
+             "INCOMPATIBLE CACHE: remove the cache or don't use it to continue"
+         )
+
+     def save_to_cache(self, where: str | None = None) -> None:
+         if not where and self.pcap_file and isinstance(self.pcap_file, str):
+             where = self.pcap_file + self.cache_file_suffix
+         if where:
+             self.save(where)
+
+     def save(self, where: str) -> None:
+         "Saves a generated dissection to a msgpack file"
+
+         # wrap the report in a version header
+         versioned_cache = {
+             self.DISSECTION_KEY: self.DISSECTION_VERSION,
+             "file": self.pcap_file,
+             "parameters": {},
+             "dissection": self.data,
+         }
+
+         for parameter in self.parameters:
+             versioned_cache["parameters"][parameter] = getattr(self, parameter)
+             # TODO: fix this hack
+
+             # basically, bin_size of 0 is 1... but it may be faster
+             # to leave it at zero to avoid the bin_size math of 1,
+             # which is actually a math noop that will still consume
+             # cycles. We save it as 1 though since the math is past
+             # us and a 1 value is more informative to the user.
+             if parameter == "bin_size" and self.bin_size == 0:
+                 versioned_cache["parameters"][parameter] = 1
+
+             if parameter == "dissector_level" and isinstance(
+                 versioned_cache["parameters"][parameter], PCAPDissectorLevel
+             ):
+                 versioned_cache["parameters"][parameter] = versioned_cache[
+                     "parameters"
+                 ][parameter].value
+
+         # msgpack can't store sets
+         versioned_cache["parameters"]["ignore_list"] = list(
+             versioned_cache["parameters"]["ignore_list"]
+         )
+
+         # save it
+         info(f"caching PCAP data to '{where}'")
+         msgpack.dump(versioned_cache, open(where, "wb"))
+
+     def load_saved_contents(self, versioned_cache):
+         # set the local parameters from the cache
+         for parameter in self.parameters:
+             setattr(self, parameter, versioned_cache["parameters"][parameter])
+
+         # load the data
+         self.data = versioned_cache["dissection"]
+
+     def load_saved(self, where: str, dont_overwrite: bool = False) -> dict:
+         "Loads a previous saved report from a file instead of re-parsing pcaps"
+         contents = msgpack.load(open(where, "rb"), strict_map_key=False)
+
+         # convert the ignore list to a set (msgpack doesn't do sets)
+         contents["parameters"]["ignore_list"] = set(
+             contents["parameters"]["ignore_list"]
+         )
+
+         # check that the version header matches something we understand
+         if contents[self.DISSECTION_KEY] != self.DISSECTION_VERSION:
+             raise ValueError(
+                 "improper saved dissection version: report version = "
+                 + str(contents[self.DISSECTION_KEY])
+                 + ", our version: "
+                 + str(self.DISSECTION_VERSION)
+             )
+
+         if not dont_overwrite:
+             self.load_saved_contents(contents)
+
+         return contents
+
+     def find_data(
+         self,
+         timestamps: List[int] | None = None,
+         match_string: str | None = None,
+         match_value: str | None = None,
+         minimum_count: int | None = None,
+         make_printable: bool = False,
+     ):
+         data = self.data
+
+         if not timestamps:
+             timestamps = data.keys()
+
+         # find timestamps/key values with at least one item above count
+         # TODO: we should really use pandas for this
+         usable = defaultdict(set)
+         for timestamp in timestamps:
+             for key in data[timestamp]:
+                 # if they requested a match string
+                 if match_string and match_string not in key:
+                     continue
+
+                 # ensure at least one of the count values for the
+                 # stream gets above minimum_count
+                 for subkey, count in data[timestamp][key].items():
+                     if (
+                         not minimum_count
+                         or minimum_count
+                         and abs(count) > minimum_count
+                     ):
+                         usable[key].add(subkey)
+
+         # TODO: move the timestamp inside the other fors for faster
+         # processing of skipped key/subkeys
+         for timestamp in timestamps:
+             for key in sorted(data[timestamp]):
+                 if key not in usable:
+                     continue
+
+                 for subkey, count in sorted(
+                     data[timestamp][key].items(), key=lambda x: x[1], reverse=True
+                 ):
+                     # check that this subkey can be usable at all
+                     if subkey not in usable[key]:
+                         continue
+
+                     if make_printable:
+                         subkey = Dissection.make_printable(key, subkey)
+                         count = Dissection.make_printable(None, count)
+
+                     if match_value and match_value not in subkey:
+                         continue
+
+                     yield (timestamp, key, subkey, count)
+
+     @staticmethod
+     def make_printable(value_type: str, value: Any) -> str:
+         try:
+             if isinstance(value, bytes):
+                 if value_type in Dissection.display_transformers:
+                     value = str(Dissection.display_transformers[value_type](value))
+                 else:
+                     value = "0x" + value.hex()
+             else:
+                 value = str(value)
+         except Exception:
+             if isinstance(value, bytes):
+                 value = "0x" + value.hex()
+             else:
+                 value = "[unprintable]"
+         if len(value) > 40:
+             value = value[0:40] + "..."  # truncate to reasonable
+         return value
+
+     def print_mac_address(value):
+         "Converts bytes to ethernet mac style address"
+
+         # TODO: certainly inefficient
+         def two_hex(value):
+             return f"{value:02x}"
+
+         return ":".join(map(two_hex, value))
+
+     display_transformers = {
+         "Ethernet.IP.src": ipaddress.ip_address,
+         "Ethernet.IP.dst": ipaddress.ip_address,
+         "Ethernet.IP6.src": ipaddress.ip_address,
+         "Ethernet.IP6.dst": ipaddress.ip_address,
+         "Ethernet.src": print_mac_address,
+         "Ethernet.dst": print_mac_address,
+     }
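
The new Dissection class keeps its counters in a nested timestamp -> key -> Counter structure, with bin 0 reserved for whole-capture running totals. A minimal usage sketch of that API follows; the capture name, timestamp, and address bytes are hypothetical, for illustration only:

    from traffic_taffy.dissection import Dissection

    # hypothetical file name; Dissection only records it, the dissector engines do the reading
    dissection = Dissection("example.pcap")

    dissection.timestamp = 1700000000  # current time bin; bin 0 always keeps the totals
    dissection.incr("Ethernet.IP.dst", b"\x0a\x00\x00\x01")  # one packet to 10.0.0.1
    dissection.incr(Dissection.TOTAL_COUNT, Dissection.TOTAL_SUBKEY)

    dissection.calculate_metadata()  # adds a __WIDTH__ value count to each key

    # make_printable=True routes bytes through display_transformers, so the
    # IP.dst bytes above come back as "10.0.0.1"
    for timestamp, key, subkey, count in dissection.find_data(make_printable=True):
        print(timestamp, key, subkey, count)
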
traffic_taffy/dissectmany.py
@@ -1,4 +1,4 @@
- from traffic_taffy.dissector import PCAPDissector, pcap_data_merge
+ from traffic_taffy.dissector import PCAPDissector
  from pcap_parallel import PCAPParallel
  from concurrent.futures import ProcessPoolExecutor
  from logging import info
@@ -30,17 +30,19 @@ class PCAPDissectMany:
              **self.kwargs,
          )
          pd.load()
-         return pd.data
+         return pd.dissection

-     def load_pcap(self, pcap_file, split_size=None, maximum_count=0):
+     def load_pcap(
+         self, pcap_file, split_size=None, maximum_count: int = 0, force: bool = False
+     ):
          pd = PCAPDissector(
              pcap_file,
              *self.args,
              **self.kwargs,
          )
-         data = pd.load_from_cache()
-         if data:
-             return {"file": pcap_file, "data": data}
+         dissection = pd.load_from_cache(force=force)
+         if dissection:
+             return dissection

          # TODO: check caching availability here
          info(f"processing {pcap_file}")
@@ -53,26 +55,26 @@ class PCAPDissectMany:
          )
          results = ps.split()

-         data = results.pop(0).result()
+         # the data is coming back in (likely overlapping) chunks, and
+         # we need to merge them together
+         dissection = results.pop(0).result()
+         dissection.pcap_file = pcap_file  # splitting has the wrong name
          for result in results:
-             data = pcap_data_merge(data, result.result())
+             dissection.merge(result.result())

-         PCAPDissector.calculate_metadata(data)
+         dissection.calculate_metadata()

          if self.kwargs.get("cache_results"):
              # create a dissector just to save the cache
              # (we don't call load())
-             pd = PCAPDissector(
-                 pcap_file,
-                 *self.args,
-                 **self.kwargs,
+             dissection.pcap_file = pcap_file
+             dissection.save_to_cache(
+                 pcap_file + "." + self.kwargs.get("cache_file_suffix", "taffy")
              )
-             pd.data = data
-             pd.save(pcap_file + ".pkl")

-         return {"file": pcap_file, "data": data}
+         return dissection

      def load_all(self):
          with ProcessPoolExecutor() as executor:
-             results = executor.map(self.load_pcap, self.pcap_files)
-             return results
+             dissections = executor.map(self.load_pcap, self.pcap_files)
+             return dissections
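
With this change load_pcap returns a Dissection (loaded from cache, or merged from the split chunks) rather than a {"file": ..., "data": ...} dict, so callers iterate over dissection objects directly. A sketch of driving the class, assuming the constructor simply stores pcap_files and forwards the remaining keyword arguments to PCAPDissector (its __init__ is not shown in this diff); the file names are placeholders:

    from traffic_taffy.dissectmany import PCAPDissectMany

    # assumed signature: a positional list of files, kwargs passed through
    pdm = PCAPDissectMany(
        ["one.pcap", "two.pcap"],   # placeholder capture files
        cache_results=True,         # read above via self.kwargs.get("cache_results")
        cache_file_suffix="taffy",  # names the saved cache file
    )

    for dissection in pdm.load_all():
        totals = dissection.data[0]  # bin 0 holds the whole-capture totals
        print(dissection.pcap_file, totals["__TOTAL__"]["packet"])
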