PyPI - flow.record - Versions diffs - 3.15.dev14__tar.gz → 3.15.dev16__tar.gz - Mend

flow.record 3.15.dev14.tar.gz → 3.15.dev16.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

{flow.record-3.15.dev14/flow.record.egg-info → flow_record-3.15.dev16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.15.dev14
+Version: 3.15.dev16
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -35,6 +35,8 @@ Requires-Dist: fastavro[snappy]; extra == "avro"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Requires-Dist: pytz; extra == "duckdb"
+Provides-Extra: splunk
+Requires-Dist: httpx; extra == "splunk"
 Provides-Extra: test
 Requires-Dist: flow.record[compression]; extra == "test"
 Requires-Dist: flow.record[avro]; extra == "test"

flow_record-3.15.dev16/flow/record/adapter/splunk.py ADDED Viewed

@@ -0,0 +1,282 @@
+import json
+import logging
+import socket
+import uuid
+from datetime import datetime
+from enum import Enum
+from typing import Optional
+from urllib.parse import urlparse
+try:
+    import httpx
+    HAS_HTTPX = True
+except ImportError:
+    HAS_HTTPX = False
+from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.base import Record
+from flow.record.jsonpacker import JsonRecordPacker
+from flow.record.utils import to_base64, to_bytes, to_str
+# Usage text for the adapter. Every [PLACEHOLDER] described below also appears in the usage URI; the query
+# parameter names (tag, token, sourcetype, ssl_verify) match the keyword arguments of SplunkWriter.__init__.
+__usage__ = """
+Splunk output adapter (writer only)
+---
+Write usage: rdump -w splunk+[PROTOCOL]://[IP]:[PORT]?tag=[TAG]&token=[TOKEN]&sourcetype=[SOURCETYPE]&ssl_verify=[SSL_VERIFY]
+[PROTOCOL]: Protocol to use for forwarding data. Can be tcp, http or https, defaults to tcp if omitted.
+[IP]:[PORT]: ip and port to a splunk instance
+[TAG]: optional value to add as "rdtag" output field when writing
+[TOKEN]: Authentication token for sending data over HTTP(S)
+[SOURCETYPE]: Set sourcetype of data. Defaults to records, but can also be set to JSON.
+[SSL_VERIFY]: Whether to verify the server certificate when sending data over HTTP(S). Defaults to True.
+"""
+# Module-level logger, named after the package this module lives in.
+log = logging.getLogger(__package__)
+# Amount of records to bundle into a single request when sending data over HTTP(S).
+RECORD_BUFFER_LIMIT = 20
+# Field names that are reserved on the Splunk side, see:
+# https://docs.splunk.com/Documentation/Splunk/7.3.1/Data/Configureindex-timefieldextraction
+RESERVED_SPLUNK_FIELDS = [
+    "_indextime",
+    "_time",
+    "index",
+    "punct",
+    "source",
+    "sourcetype",
+    "tag",
+    "type",
+]
+# Underscore-prefixed record field names that are renamed as well when writing to Splunk.
+RESERVED_RECORD_FIELDS = ["_classification", "_generated", "_source"]
+# Union of both lists; the splunkify_* helpers emit a field with one of these names as "rd_<name>".
+PREFIX_WITH_RD = set(RESERVED_SPLUNK_FIELDS + RESERVED_RECORD_FIELDS)
+class Protocol(Enum):
+    """Transports supported by the Splunk writer; the values are the lower-cased URI schemes."""
+    HTTP = "http"
+    HTTPS = "https"
+    TCP = "tcp"
+class SourceType(Enum):
+    """Output formats of the Splunk writer.
+
+    RECORDS serialises a record as key="value" pairs (the only format allowed over TCP); JSON serialises it
+    as a JSON document with the record stored under the "event" key.
+    """
+    JSON = "json"
+    RECORDS = "records"
+def splunkify_key_value(record: Record, tag: Optional[str] = None) -> str:
+    """Serialise a record into one line of space-separated ``key="value"`` pairs.
+
+    The line starts with ``rdtype`` (the record descriptor name) and ``rdtag``, followed by every field of the
+    record except ``_version``. Fields whose name is in ``PREFIX_WITH_RD`` are emitted as ``rd_<name>``.
+    ``None`` values (and a ``None`` tag) are written as the bare word ``None`` without quotes, ``bytes`` values
+    are base64 encoded and everything else is converted with ``to_str``.
+
+    Args:
+        record: The record to serialise.
+        tag: Optional value for the ``rdtag`` field.
+
+    Returns:
+        The serialised record, without a trailing newline.
+    """
+    def _escape(text: str) -> str:
+        # Backslashes go first, otherwise the backslashes added for the quotes would be doubled again.
+        return text.replace("\\", "\\\\").replace('"', '\\"')
+    ret = []
+    ret.append(f'rdtype="{record._desc.name}"')
+    if tag is None:
+        ret.append("rdtag=None")
+    else:
+        # The tag is user supplied, so it gets the same escaping as field values; an embedded double quote
+        # would otherwise terminate the value early and corrupt the rest of the line.
+        ret.append(f'rdtag="{_escape(str(tag))}"')
+    for field in record._desc.get_all_fields():
+        # Omit the _version field as the Splunk adapter has no reader support for deserialising records back.
+        if field == "_version":
+            continue
+        val = getattr(record, field)
+        if field in PREFIX_WITH_RD:
+            field = f"rd_{field}"
+        if val is None:
+            ret.append(f"{field}=None")
+        else:
+            val = to_base64(val) if isinstance(val, bytes) else to_str(val)
+            ret.append(f'{field}="{_escape(val)}"')
+    return " ".join(ret)
+def splunkify_json(packer: JsonRecordPacker, record: Record, tag: Optional[str] = None) -> str:
+    """Serialise a record into the JSON text sent to Splunk.
+
+    The resulting object holds the record itself under ``event`` plus, when the record has a truthy value for
+    them, the indexing keys ``host`` (from ``host``, overridden by ``hostname``) and ``time`` (from ``ts``).
+
+    Args:
+        packer: Packer used to turn the record (and any nested value) into JSON-serialisable objects.
+        record: The record to serialise.
+        tag: Optional value stored in the event as ``rdtag``.
+
+    Returns:
+        The JSON document as a string.
+    """
+    # (key in the Splunk payload, attribute on the record). Order matters: a later pair overwrites an earlier
+    # one that targets the same payload key.
+    indexer_fields = (
+        ("host", "host"),
+        ("host", "hostname"),
+        ("time", "ts"),
+    )
+    payload = {}
+    for splunk_name, field_name in indexer_fields:
+        if not hasattr(record, field_name):
+            continue
+        value = getattr(record, field_name)
+        if not value:
+            continue
+        # Datetime objects are converted to an epoch timestamp, anything else to its string form.
+        payload[splunk_name] = value.timestamp() if isinstance(value, datetime) else to_str(value)
+    event = packer.pack_obj(record)
+    # Omit the _version field as the Splunk adapter has no reader support for deserialising records back.
+    del event["_version"]
+    # Reserved field names cannot be used as-is inside the event, so they move to an 'rd_' prefixed key.
+    for reserved in PREFIX_WITH_RD:
+        if reserved in event:
+            event[f"rd_{reserved}"] = event.pop(reserved)
+    # Finally add the tag and the type (i.e. the record descriptor's name) to the event.
+    event["rdtag"] = tag
+    event["rdtype"] = record._desc.name
+    payload["event"] = event
+    return json.dumps(payload, default=packer.pack_obj)
+class SplunkWriter(AbstractWriter):
+    """Record writer that forwards records to Splunk over a raw TCP socket or the HTTP(S) collector.
+
+    Over TCP every record is sent immediately as a line of key="value" pairs. Over HTTP(S) records are
+    buffered and posted in batches of ``RECORD_BUFFER_LIMIT``; ``flush()`` or ``close()`` sends the remainder.
+    """
+    sock = None
+    session = None
+    def __init__(
+        self,
+        uri: str,
+        tag: Optional[str] = None,
+        token: Optional[str] = None,
+        sourcetype: Optional[str] = None,
+        ssl_verify: bool = True,
+        **kwargs,
+    ):
+        """Parse the target URI and open the TCP socket or HTTP session.
+
+        Args:
+            uri: ``[PROTOCOL://]HOST[:PORT]``; without a protocol, tcp is assumed.
+            tag: Optional value written as the ``rdtag`` field of every record.
+            token: Authorization token, required for HTTP(S). A ``Splunk `` prefix is added when missing.
+            sourcetype: ``records`` (default) or ``json``, matched case-insensitively.
+            ssl_verify: Certificate verification for HTTP(S); ``0`` and ``false`` (any case) disable it.
+            **kwargs: Ignored.
+
+        Raises:
+            ValueError: For an unknown protocol or sourcetype, a non-records sourcetype over TCP, or a missing
+                token for HTTP(S).
+            ImportError: When HTTP(S) is requested but httpx is not installed.
+        """
+        # If the writer is initiated without a protocol, we assume we will be writing over tcp
+        if "://" not in uri:
+            uri = f"tcp://{uri}"
+        if sourcetype is None:
+            log.warning("No sourcetype provided, assuming 'records' sourcetype")
+            sourcetype = SourceType.RECORDS
+        elif isinstance(sourcetype, str):
+            # The URL scheme is lower-cased before it is matched, do the same for the sourcetype so that the
+            # "JSON" spelling from the usage text is accepted as well.
+            sourcetype = sourcetype.lower()
+        parsed_url = urlparse(uri)
+        url_scheme = parsed_url.scheme.lower()
+        self.sourcetype = SourceType(sourcetype)
+        self.protocol = Protocol(url_scheme)
+        if self.protocol == Protocol.TCP and self.sourcetype != SourceType.RECORDS:
+            raise ValueError("For sending data to Splunk over TCP, only the 'records' sourcetype is allowed")
+        self.host = parsed_url.hostname
+        self.port = parsed_url.port
+        self.tag = tag
+        self.record_buffer = []
+        self._warned = False
+        self.packer = None
+        if self.sourcetype == SourceType.JSON:
+            self.packer = JsonRecordPacker(indent=4, pack_descriptors=False)
+        if self.protocol == Protocol.TCP:
+            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.SOL_TCP)
+            self.sock.connect((self.host, self.port))
+            self._send = self._send_tcp
+        elif self.protocol in (Protocol.HTTP, Protocol.HTTPS):
+            if not HAS_HTTPX:
+                raise ImportError("The httpx library is required for sending data over HTTP(S)")
+            scheme = self.protocol.value
+            self.token = token
+            if not self.token:
+                raise ValueError("An authorization token is required for the HTTP collector")
+            if not self.token.startswith("Splunk "):
+                self.token = f"Splunk {self.token}"
+            # Assume verify=True unless specified otherwise.
+            self.verify = str(ssl_verify).lower() not in ("0", "false")
+            if not self.verify:
+                log.warning("Certificate verification is disabled")
+            endpoint = "event" if self.sourcetype != SourceType.RECORDS else "raw"
+            port = f":{self.port}" if self.port else ""
+            self.url = f"{scheme}://{self.host}{port}/services/collector/{endpoint}?auto_extract_timestamp=true"
+            self.headers = {
+                "Authorization": self.token,
+                # A randomized value so that Splunk can loadbalance between different incoming datastreams
+                "X-Splunk-Request-Channel": str(uuid.uuid4()),
+            }
+            self.session = httpx.Client(verify=self.verify, headers=self.headers)
+            self._send = self._send_http
+    def _cache_records_for_http(self, data: Optional[bytes] = None, flush: bool = False) -> Optional[bytes]:
+        """Buffer ``data`` and return the joined buffer once it is full or ``flush`` is set.
+
+        Returns:
+            The concatenated buffered records (the buffer is cleared), or ``None`` when there is nothing to send yet.
+        """
+        # It's possible to call this function without any data, purely to flush. Hence this check.
+        if data:
+            self.record_buffer.append(data)
+        if len(self.record_buffer) < RECORD_BUFFER_LIMIT and not flush:
+            # Buffer limit not exceeded yet, so we do not return a buffer yet, unless buffer is explicitly flushed.
+            return
+        buf = b"".join(self.record_buffer)
+        if not buf:
+            return
+        # We're going to be returning a buffer for the writer to send, so we can clear the internal record buffer.
+        self.record_buffer.clear()
+        return buf
+    def _send(self, data: bytes) -> None:
+        """Placeholder; ``__init__`` replaces it on the instance with the TCP or HTTP sender."""
+        raise RuntimeError("This method should be overridden at runtime")
+    def _send_http(self, data: Optional[bytes] = None, flush: bool = False) -> None:
+        """Buffer ``data`` and post the batch to the collector when it is full or ``flush`` is set.
+
+        Raises:
+            ConnectionError: When the collector answers with a status code other than 200.
+        """
+        buf = self._cache_records_for_http(data, flush)
+        if not buf:
+            return
+        # httpx takes raw bytes through ``content=``; passing bytes via ``data=`` is deprecated and warns.
+        response = self.session.post(self.url, content=buf)
+        if response.status_code != 200:
+            raise ConnectionError(f"{response.text} ({response.status_code})")
+    def _send_tcp(self, data: bytes) -> None:
+        """Send ``data`` over the TCP socket right away."""
+        self.sock.sendall(data)
+    def write(self, record: Record) -> None:
+        """Serialise ``record`` according to the sourcetype and hand it to the active sender."""
+        if not self._warned and "rdtag" in record._desc.fields:
+            # Only warn once per writer.
+            self._warned = True
+            log.warning(
+                "Record has 'rdtag' field which conflicts with the Splunk adapter -- "
+                "Splunk output will have duplicate 'rdtag' fields",
+            )
+        if self.sourcetype == SourceType.RECORDS:
+            rec = splunkify_key_value(record, self.tag)
+        else:
+            rec = splunkify_json(self.packer, record, self.tag)
+        # Trail with a newline for line breaking.
+        data = to_bytes(rec) + b"\n"
+        self._send(data)
+    def flush(self) -> None:
+        """Send any records still buffered for HTTP(S); does nothing for TCP."""
+        if self.protocol in [Protocol.HTTP, Protocol.HTTPS]:
+            self._send_http(flush=True)
+    def close(self) -> None:
+        """Flush pending records and release the socket or HTTP session. Safe to call more than once."""
+        # For TCP
+        if self.sock:
+            self.sock.close()
+        self.sock = None
+        if self.session:
+            try:
+                self.flush()
+            finally:
+                # Release the session even when the final flush fails, then let the error propagate.
+                self.session.close()
+                self.session = None
+        self.session = None
+class SplunkReader(AbstractReader):
+    """Placeholder reader: the Splunk adapter is writer only, so constructing a reader always fails."""
+    def __init__(self, path, selector=None, **kwargs):
+        # Reading records back from Splunk is not supported.
+        raise NotImplementedError()

{flow.record-3.15.dev14 → flow_record-3.15.dev16}/flow/record/fieldtypes/__init__.py RENAMED Viewed

@@ -5,13 +5,14 @@ import math
 import os
 import pathlib
 import re
+import shlex
 import sys
 import warnings
 from binascii import a2b_hex, b2a_hex
 from datetime import datetime as _dt
 from datetime import timezone
 from posixpath import basename, dirname
-from typing import Any, Optional, Tuple
+from typing import Any, Optional
 from urllib.parse import urlparse
 try:
@@ -34,8 +35,8 @@ UTC = timezone.utc
 PY_311 = sys.version_info >= (3, 11, 0)
 PY_312 = sys.version_info >= (3, 12, 0)
-PATH_POSIX = 0
-PATH_WINDOWS = 1
+TYPE_POSIX = 0
+TYPE_WINDOWS = 1
 string_type = str
 varint_type = int
@@ -694,15 +695,15 @@ class path(pathlib.PurePath, FieldType):
         return repr(str(self))
     def _pack(self):
-        path_type = PATH_WINDOWS if isinstance(self, windows_path) else PATH_POSIX
+        path_type = TYPE_WINDOWS if isinstance(self, windows_path) else TYPE_POSIX
         return (str(self), path_type)
     @classmethod
-    def _unpack(cls, data: Tuple[str, str]):
+    def _unpack(cls, data: tuple[str, str]):
         path_, path_type = data
-        if path_type == PATH_POSIX:
+        if path_type == TYPE_POSIX:
             return posix_path(path_)
-        elif path_type == PATH_WINDOWS:
+        elif path_type == TYPE_WINDOWS:
             return windows_path(path_)
         else:
             # Catch all: default to posix_path
@@ -734,3 +735,115 @@ class windows_path(pathlib.PureWindowsPath, path):
                 quote = '"'
         return f"{quote}{s}{quote}"
+class command(FieldType):
+    """Field type for a command line, stored as an executable path plus a list of arguments.
+
+    Instantiating ``command`` with a string picks ``windows_command`` or ``posix_command`` based on how the
+    string starts; the subclasses can also be instantiated directly (``from_windows`` / ``from_posix``).
+    """
+    executable: Optional[path] = None
+    args: Optional[list[str]] = None
+    _path_type: type[path] = None
+    _posix: bool
+    def __new__(cls, value: str) -> command:
+        """Create the instance, selecting the flavour when called as ``command(...)`` itself.
+
+        Raises:
+            ValueError: When called on ``command`` itself with a value that is not a string.
+        """
+        if cls is not command:
+            return super().__new__(cls)
+        if not isinstance(value, str):
+            raise ValueError(f"Expected a value of type 'str' not {type(value)}")
+        # pre checking for windows like paths
+        # This checks for windows like starts of a path:
+        #   an '%' for an environment variable
+        #   r'\\' for a UNC path
+        #   a ':' as second character (after leading quotes) for `<drive_letter>:`
+        # The slice, unlike indexing with [1], does not raise for values shorter than two characters.
+        windows = value.startswith((r"\\", "%")) or value.lstrip("\"'")[1:2] == ":"
+        if windows:
+            cls = windows_command
+        else:
+            cls = posix_command
+        return super().__new__(cls)
+    def __init__(self, value: str | tuple[str, tuple[str]] | None):
+        """Initialise from a command line string, an ``(executable, args)`` pair, or ``None``.
+
+        With ``None`` the class defaults are kept (``executable`` and ``args`` are both ``None``).
+        """
+        if value is None:
+            return
+        if isinstance(value, str):
+            self.executable, self.args = self._split(value)
+            return
+        executable, self.args = value
+        self.executable = self._path_type(executable)
+        self.args = list(self.args)
+    def __repr__(self) -> str:
+        return f"(executable={self.executable!r}, args={self.args})"
+    def __eq__(self, other: Any) -> bool:
+        """Compare against another command, a command line string, or an ``(executable, *args)`` sequence."""
+        if isinstance(other, command):
+            return self.executable == other.executable and self.args == other.args
+        elif isinstance(other, str):
+            return self._join() == other
+        elif isinstance(other, (tuple, list)):
+            # An empty sequence has no executable to compare with and is never equal.
+            return len(other) > 0 and self.executable == other[0] and self.args == list(other[1:])
+        return False
+    def _split(self, value: str) -> tuple[str, list[str]]:
+        """Split a command line into the executable (as ``_path_type``) and the list of arguments.
+
+        Raises:
+            ValueError: When ``value`` contains no tokens at all.
+        """
+        parts = shlex.split(value, posix=self._posix)
+        if not parts:
+            # Unpacking an empty list raises ValueError as well, but with a message that says little.
+            raise ValueError("Cannot determine the executable of an empty command")
+        executable, *args = parts
+        executable = executable.strip("'\" ")
+        return self._path_type(executable), args
+    def _join(self) -> str:
+        """Return the command as a single shell-quoted string."""
+        return shlex.join([str(self.executable)] + self.args)
+    def _pack(self) -> tuple[tuple[str, list] | None, int]:
+        """Return ``((executable, args), type)``, or ``(None, type)`` when there is no executable."""
+        command_type = TYPE_WINDOWS if isinstance(self, windows_command) else TYPE_POSIX
+        if self.executable:
+            # Only the string part of the packed path is kept; the flavour is carried by command_type.
+            _exec, _ = self.executable._pack()
+            return ((_exec, self.args), command_type)
+        else:
+            return (None, command_type)
+    @classmethod
+    def _unpack(cls, data: tuple[tuple[str, tuple] | None, int]) -> command:
+        """Rebuild a command from the ``(value, type)`` pair produced by ``_pack``; non-Windows types map to posix."""
+        _value, _type = data
+        if _type == TYPE_WINDOWS:
+            return windows_command(_value)
+        return posix_command(_value)
+    @classmethod
+    def from_posix(cls, value: str) -> command:
+        """Create a ``posix_command`` without flavour detection."""
+        return posix_command(value)
+    @classmethod
+    def from_windows(cls, value: str) -> command:
+        """Create a ``windows_command`` without flavour detection."""
+        return windows_command(value)
+class posix_command(command):
+    """POSIX flavour of ``command``: shlex splitting in POSIX mode, executable stored as ``posix_path``."""
+    _posix = True
+    _path_type = posix_path
+class windows_command(command):
+    """Windows flavour of ``command``: non-POSIX shlex splitting, executable stored as ``windows_path``.
+
+    Everything after the executable is kept as one single argument string.
+    """
+    _posix = False
+    _path_type = windows_path
+    def _split(self, value: str) -> tuple[str, list[str]]:
+        """Split like the base class, then merge all arguments back into one space-joined entry."""
+        program, arguments = super()._split(value)
+        # An empty argument list is passed through untouched.
+        return program, ([" ".join(arguments)] if arguments else arguments)
+    def _join(self) -> str:
+        """Return the executable (single-quoted when it contains a space) followed by the first argument."""
+        program = str(self.executable)
+        if " " in program:
+            program = f"'{program}'"
+        suffix = f" {self.args[0]}" if self.args else ""
+        return f"{program}{suffix}"

{flow.record-3.15.dev14 → flow_record-3.15.dev16}/flow/record/jsonpacker.py RENAMED Viewed

@@ -72,6 +72,11 @@ class JsonRecordPacker:
             return base64.b64encode(obj).decode()
         if isinstance(obj, fieldtypes.path):
             return str(obj)
+        if isinstance(obj, fieldtypes.command):
+            return {
+                "executable": obj.executable,
+                "args": obj.args,
+            }
         raise Exception("Unpackable type " + str(type(obj)))

{flow.record-3.15.dev14 → flow_record-3.15.dev16}/flow/record/version.py RENAMED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '3.15.dev14'
-__version_tuple__ = version_tuple = (3, 15, 'dev14')
+__version__ = version = '3.15.dev16'
+__version_tuple__ = version_tuple = (3, 15, 'dev16')

{flow.record-3.15.dev14 → flow_record-3.15.dev16}/flow/record/whitelist.py RENAMED Viewed

@@ -1,5 +1,6 @@
 WHITELIST = [
     "boolean",
+    "command",
     "dynamic",
     "datetime",
     "filesize",

{flow.record-3.15.dev14 → flow_record-3.15.dev16/flow.record.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.15.dev14
+Version: 3.15.dev16
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -35,6 +35,8 @@ Requires-Dist: fastavro[snappy]; extra == "avro"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Requires-Dist: pytz; extra == "duckdb"
+Provides-Extra: splunk
+Requires-Dist: httpx; extra == "splunk"
 Provides-Extra: test
 Requires-Dist: flow.record[compression]; extra == "test"
 Requires-Dist: flow.record[avro]; extra == "test"

{flow.record-3.15.dev14 → flow_record-3.15.dev16}/flow.record.egg-info/requires.txt RENAMED Viewed

@@ -23,6 +23,9 @@ elasticsearch
 [geoip]
 maxminddb
+[splunk]
+httpx
 [test]
 flow.record[compression]
 flow.record[avro]

{flow.record-3.15.dev14 → flow_record-3.15.dev16}/pyproject.toml RENAMED Viewed

@@ -53,6 +53,9 @@ duckdb = [
     "duckdb",
     "pytz", # duckdb requires pytz for timezone support
 ]
+splunk = [
+    "httpx",
+]
 test = [
     "flow.record[compression]",
     "flow.record[avro]",

flow.record 3.15.dev14__tar.gz → 3.15.dev16__tar.gz

flow.record 3.15.dev14.tar.gz → 3.15.dev16.tar.gz