PyPI - flow.record - Versions diffs - 3.12.dev5__tar.gz → 3.13.dev2__tar.gz - Mend

flow.record 3.12.dev5__tar.gz → 3.13.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

{flow.record-3.12.dev5/flow.record.egg-info → flow.record-3.13.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.12.dev5
+Version: 3.13.dev2
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -18,13 +18,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Utilities
 Requires-Python: ~=3.7
 Description-Content-Type: text/markdown
+License-File: LICENSE
+License-File: COPYRIGHT
+Requires-Dist: msgpack>=0.5.2
+Requires-Dist: backports.zoneinfo[tzdata]; python_version < "3.9"
+Requires-Dist: tzdata; platform_system == "Windows"
 Provides-Extra: compression
+Requires-Dist: lz4; extra == "compression"
+Requires-Dist: zstandard; extra == "compression"
 Provides-Extra: elastic
+Requires-Dist: elasticsearch; extra == "elastic"
 Provides-Extra: geoip
+Requires-Dist: maxminddb; extra == "geoip"
 Provides-Extra: avro
+Requires-Dist: fastavro[snappy]; extra == "avro"
 Provides-Extra: test
-License-File: LICENSE
-License-File: COPYRIGHT
+Requires-Dist: lz4; extra == "test"
+Requires-Dist: zstandard; extra == "test"
+Requires-Dist: fastavro; extra == "test"
 # flow.record

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/__init__.py RENAMED Viewed

@@ -3,6 +3,7 @@ import os
 from flow.record.base import (
     RECORD_VERSION,
+    RECORDSTREAM_MAGIC,
     DynamicDescriptor,
     FieldType,
     GroupedRecord,
@@ -16,7 +17,9 @@ from flow.record.base import (
     dynamic_fieldtype,
     extend_record,
     iter_timestamped_records,
+    open_file,
     open_path,
+    open_stream,
     stream,
 )
 from flow.record.jsonpacker import JsonRecordPacker
@@ -33,6 +36,7 @@ from flow.record.stream import (
 __all__ = [
     "RECORD_VERSION",
+    "RECORDSTREAM_MAGIC",
     "FieldType",
     "Record",
     "GroupedRecord",
@@ -47,7 +51,9 @@ __all__ = [
     "JsonRecordPacker",
     "RecordStreamWriter",
     "RecordStreamReader",
+    "open_file",
     "open_path",
+    "open_stream",
     "stream",
     "dynamic_fieldtype",
     "DynamicDescriptor",

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/avro.py RENAMED Viewed

@@ -1,6 +1,9 @@
+from __future__ import annotations
 import json
 from datetime import datetime, timedelta, timezone
 from importlib.util import find_spec
+from typing import Any, Iterator
 import fastavro
@@ -50,7 +53,7 @@ class AvroWriter(AbstractWriter):
     writer = None
     def __init__(self, path, key=None, **kwargs):
-        self.fp = record.open_path(path, "wb")
+        self.fp = record.open_file(path, "wb")
         self.desc = None
         self.schema = None
@@ -58,7 +61,7 @@ class AvroWriter(AbstractWriter):
         self.writer = None
         self.codec = "snappy" if find_spec("snappy") else "deflate"
-    def write(self, r):
+    def write(self, r: record.Record) -> None:
         if not self.desc:
             self.desc = r._desc
             self.schema = descriptor_to_schema(self.desc)
@@ -79,7 +82,7 @@ class AvroWriter(AbstractWriter):
             )
         self.writer.flush()
-    def close(self):
+    def close(self) -> None:
         if self.fp and not is_stdout(self.fp):
             self.fp.close()
         self.fp = None
@@ -90,7 +93,7 @@ class AvroReader(AbstractReader):
     fp = None
     def __init__(self, path, selector=None, **kwargs):
-        self.fp = record.open_path(path, "rb")
+        self.fp = record.open_file(path, "rb")
         self.selector = make_selector(selector)
         self.reader = fastavro.reader(self.fp)
@@ -105,7 +108,7 @@ class AvroReader(AbstractReader):
             name for name, field in self.desc.get_all_fields().items() if field.typename == "datetime"
         )
-    def __iter__(self):
+    def __iter__(self) -> Iterator[record.Record]:
         for obj in self.reader:
             # Convert timestamp-micros fields back to datetime fields
             for field_name in self.datetime_fields:
@@ -117,13 +120,13 @@ class AvroReader(AbstractReader):
             if not self.selector or self.selector.match(rec):
                 yield rec
-    def close(self):
+    def close(self) -> None:
         if self.fp:
             self.fp.close()
         self.fp = None
-def descriptor_to_schema(desc):
+def descriptor_to_schema(desc: record.RecordDescriptor) -> dict[str, Any]:
     namespace, _, name = desc.name.rpartition("/")
     schema = {
         "type": "record",
@@ -156,7 +159,7 @@ def descriptor_to_schema(desc):
     return schema
-def schema_to_descriptor(schema):
+def schema_to_descriptor(schema: dict) -> record.RecordDescriptor:
     doc = schema.get("doc")
     # Sketchy record descriptor detection
@@ -178,7 +181,7 @@ def schema_to_descriptor(schema):
     return record.RecordDescriptor(name, fields)
-def avro_type_to_flow_type(ftype):
+def avro_type_to_flow_type(ftype: list) -> str:
     ftypes = [ftype] if not isinstance(ftype, list) else ftype
     # If a field can be null, it has an additional type of "null"

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/stream.py RENAMED Viewed

@@ -1,5 +1,8 @@
-from flow import record
+from typing import Iterator, Union
+from flow.record import Record, RecordOutput, RecordStreamReader, open_file, open_path
 from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.selector import Selector
 from flow.record.utils import is_stdout
 __usage__ = """
@@ -15,20 +18,20 @@ class StreamWriter(AbstractWriter):
     fp = None
     stream = None
-    def __init__(self, path, clobber=True, **kwargs):
-        self.fp = record.open_path(path, "wb", clobber=clobber)
-        self.stream = record.RecordOutput(self.fp)
+    def __init__(self, path: str, clobber=True, **kwargs):
+        self.fp = open_path(path, "wb", clobber=clobber)
+        self.stream = RecordOutput(self.fp)
-    def write(self, r):
-        self.stream.write(r)
+    def write(self, record: Record) -> None:
+        self.stream.write(record)
-    def flush(self):
+    def flush(self) -> None:
         if self.stream and hasattr(self.stream, "flush"):
             self.stream.flush()
         if self.fp:
             self.fp.flush()
-    def close(self):
+    def close(self) -> None:
         if self.stream:
             self.stream.close()
         self.stream = None
@@ -42,14 +45,14 @@ class StreamReader(AbstractReader):
     fp = None
     stream = None
-    def __init__(self, path, selector=None, **kwargs):
-        self.fp = record.open_path(path, "rb")
-        self.stream = record.RecordStreamReader(self.fp, selector=selector)
+    def __init__(self, path: str, selector: Union[str, Selector] = None, **kwargs):
+        self.fp = open_file(path, "rb")
+        self.stream = RecordStreamReader(self.fp, selector=selector)
-    def __iter__(self):
+    def __iter__(self) -> Iterator[Record]:
         return iter(self.stream)
-    def close(self):
+    def close(self) -> None:
         if self.stream:
             self.stream.close()
         self.stream = None

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/base.py RENAMED Viewed

@@ -14,10 +14,12 @@ import sys
 import warnings
 from datetime import datetime, timezone
 from itertools import zip_longest
-from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple
+from pathlib import Path
+from typing import IO, Any, BinaryIO, Iterator, Mapping, Optional, Sequence, Union
 from urllib.parse import parse_qsl, urlparse
-from .exceptions import RecordDescriptorError
+from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.exceptions import RecordAdapterNotFound, RecordDescriptorError
 try:
     import lz4.frame as lz4
@@ -38,6 +40,13 @@ try:
 except ImportError:
     HAS_ZSTD = False
+try:
+    import fastavro as avro  # noqa
+    HAS_AVRO = True
+except ImportError:
+    HAS_AVRO = False
 from collections import OrderedDict
 from .utils import to_native_str, to_str
@@ -63,6 +72,10 @@ GZIP_MAGIC = b"\x1f\x8b"
 BZ2_MAGIC = b"BZh"
 LZ4_MAGIC = b"\x04\x22\x4d\x18"
 ZSTD_MAGIC = b"\x28\xb5\x2f\xfd"
+AVRO_MAGIC = b"Obj"
+RECORDSTREAM_MAGIC = b"RECORDSTREAM\n"
+RECORDSTREAM_MAGIC_DEPTH = 4 + 2 + len(RECORDSTREAM_MAGIC)
 RE_VALID_FIELD_NAME = re.compile(r"^_?[a-zA-Z][a-zA-Z0-9_]*(?:\[\])?$")
 RE_VALID_RECORD_TYPE_NAME = re.compile("^[a-zA-Z][a-zA-Z0-9_]*(/[a-zA-Z][a-zA-Z0-9_]*)*$")
@@ -83,37 +96,6 @@ class {name}(Record):
 """
-class Peekable:
-    """Wrapper class for adding .peek() to a file object."""
-    def __init__(self, fd):
-        self.fd = fd
-        self.buffer = None
-    def peek(self, size):
-        if self.buffer is not None:
-            raise BufferError("Only 1 peek allowed")
-        data = self.fd.read(size)
-        self.buffer = io.BytesIO(data)
-        return data
-    def read(self, size=None):
-        data = b""
-        if self.buffer is None:
-            data = self.fd.read(size)
-        else:
-            data = self.buffer.read(size)
-            if len(data) < size:
-                data += self.fd.read(size - len(data))
-                self.buffer = None
-        return data
-    def close(self):
-        self.buffer = None
-        self.fd.close()
-        self.fd = None
 class FieldType:
     def _typename(self):
         t = type(self)
@@ -339,7 +321,7 @@ class RecordFieldSet(list):
 @functools.lru_cache(maxsize=4096)
-def _generate_record_class(name: str, fields: Tuple[Tuple[str, str]]) -> type:
+def _generate_record_class(name: str, fields: tuple[tuple[str, str]]) -> type:
     """Generate a record class
     Args:
@@ -442,9 +424,9 @@ class RecordDescriptor:
     _desc_hash: int = None
     _fields: Mapping[str, RecordField] = None
     _all_fields: Mapping[str, RecordField] = None
-    _field_tuples: Sequence[Tuple[str, str]] = None
+    _field_tuples: Sequence[tuple[str, str]] = None
-    def __init__(self, name: str, fields: Optional[Sequence[Tuple[str, str]]] = None):
+    def __init__(self, name: str, fields: Optional[Sequence[tuple[str, str]]] = None):
         if not name:
             raise RecordDescriptorError("Record name is required")
@@ -548,7 +530,7 @@ class RecordDescriptor:
         """Create a new Record initialized with `args` and `kwargs`."""
         return self.recordType(*args, **kwargs)
-    def init_from_dict(self, rdict: Dict[str, Any], raise_unknown=False) -> Record:
+    def init_from_dict(self, rdict: dict[str, Any], raise_unknown=False) -> Record:
         """Create a new Record initialized with key, value pairs from `rdict`.
         If `raise_unknown=True` then fields on `rdict` that are unknown to this
@@ -575,7 +557,7 @@ class RecordDescriptor:
         """
         return self.init_from_dict(record._asdict(), raise_unknown=raise_unknown)
-    def extend(self, fields: Sequence[Tuple[str, str]]) -> RecordDescriptor:
+    def extend(self, fields: Sequence[tuple[str, str]]) -> RecordDescriptor:
         """Returns a new RecordDescriptor with the extended fields
         Returns:
@@ -584,7 +566,7 @@ class RecordDescriptor:
         new_fields = list(self.get_field_tuples()) + fields
         return RecordDescriptor(self.name, new_fields)
-    def get_field_tuples(self) -> Tuple[Tuple[str, str]]:
+    def get_field_tuples(self) -> tuple[tuple[str, str]]:
         """Returns a tuple containing the (typename, name) tuples, eg:
         (('boolean', 'foo'), ('string', 'bar'))
@@ -596,7 +578,7 @@ class RecordDescriptor:
     @staticmethod
     @functools.lru_cache(maxsize=256)
-    def calc_descriptor_hash(name, fields: Sequence[Tuple[str, str]]) -> int:
+    def calc_descriptor_hash(name, fields: Sequence[tuple[str, str]]) -> int:
         """Calculate and return the (cached) descriptor hash as a 32 bit integer.
         The descriptor hash is the first 4 bytes of the sha256sum of the descriptor name and field names and types.
@@ -612,7 +594,7 @@ class RecordDescriptor:
         return self._desc_hash
     @property
-    def identifier(self) -> Tuple[str, int]:
+    def identifier(self) -> tuple[str, int]:
         """Returns a tuple containing the descriptor name and hash"""
         return (self.name, self.descriptor_hash)
@@ -650,11 +632,11 @@ class RecordDescriptor:
         return wrapper
-    def _pack(self) -> Tuple[str, Tuple[Tuple[str, str]]]:
+    def _pack(self) -> tuple[str, tuple[tuple[str, str]]]:
         return (self.name, self._field_tuples)
     @staticmethod
-    def _unpack(name, fields: Tuple[Tuple[str, str]]) -> RecordDescriptor:
+    def _unpack(name, fields: tuple[tuple[str, str]]) -> RecordDescriptor:
         return RecordDescriptor(name, fields)
@@ -662,17 +644,66 @@ def DynamicDescriptor(name, fields):
     return RecordDescriptor(name, [("dynamic", field) for field in fields])
-def open_path(path, mode, clobber=True):
+def open_stream(fp: BinaryIO, mode: str) -> BinaryIO:
+    if not hasattr(fp, "peek"):
+        fp = io.BufferedReader(fp)
+    # We peek into the file at the maximum possible length we might need, which is the amount of bytes needed to
+    # determine whether a stream is a RECORDSTREAM or not.
+    peek_data = fp.peek(RECORDSTREAM_MAGIC_DEPTH)
+    # If the data stream is compressed, we wrap the file pointer in a reader that can decompress accordingly.
+    if peek_data[:2] == GZIP_MAGIC:
+        fp = gzip.GzipFile(fileobj=fp, mode=mode)
+    elif HAS_BZ2 and peek_data[:3] == BZ2_MAGIC:
+        fp = bz2.BZ2File(fp, mode=mode)
+    elif HAS_LZ4 and peek_data[:4] == LZ4_MAGIC:
+        fp = lz4.open(fp, mode=mode)
+    elif HAS_ZSTD and peek_data[:4] == ZSTD_MAGIC:
+        dctx = zstd.ZstdDecompressor()
+        fp = dctx.stream_reader(fp)
+    return fp
+def find_adapter_for_stream(fp: BinaryIO) -> tuple[BinaryIO, Optional[str]]:
+    # We need to peek into the stream to be able to determine which adapter is needed. The fp given to this function
+    # may not support peek() itself (for example when it is a transparent decompressor), in which case we wrap it in
+    # an io.BufferedReader first. The (possibly wrapped) fp is returned alongside the adapter name, and callers
+    # should continue reading from that returned fp rather than from the one they passed in.
+    if not hasattr(fp, "peek"):
+        fp = io.BufferedReader(fp)
+    peek_data = fp.peek(RECORDSTREAM_MAGIC_DEPTH)
+    if HAS_AVRO and peek_data[:3] == AVRO_MAGIC:
+        return fp, "avro"
+    elif RECORDSTREAM_MAGIC in peek_data[:RECORDSTREAM_MAGIC_DEPTH]:
+        return fp, "stream"
+    return fp, None
+def open_file(path: Union[str, Path, BinaryIO], mode: str, clobber: bool = True) -> IO:
+    if isinstance(path, Path):
+        path = str(path)
+    if isinstance(path, str):
+        return open_path(path, mode, clobber)
+    elif isinstance(path, io.IOBase):
+        return open_stream(path, "rb")
+    else:
+        raise ValueError(f"Unsupported path type {path}")
+def open_path(path: str, mode: str, clobber: bool = True) -> IO:
     """
-    Open `path` using `mode` and returns a file object.
+    Open ``path`` using ``mode`` and return a file object.
     It handles special cases if path is meant to be stdin or stdout.
     And also supports compression based on extension or file header of stream.
     Args:
-        path (str): Filename or path to filename to open
-        mode (str): Could be "r", "rb" to open file for reading, "w", "wb" for writing
-        clobber (bool): Overwrite file if it already exists if `clobber=True`, else raises IOError.
+        path: Filename or path to filename to open
+        mode: Could be "r", "rb" to open file for reading, "w", "wb" for writing
+        clobber: If ``True``, overwrite the file when it already exists; otherwise raises IOError.
     """
     binary = "b" in mode
@@ -724,24 +755,18 @@ def open_path(path, mode, clobber=True):
             fp = io.open(path, mode)
         # check if we are reading a compressed stream
         if not out and binary:
-            if not hasattr(fp, "peek"):
-                fp = Peekable(fp)
-            peek_data = fp.peek(4)
-            if peek_data[:2] == GZIP_MAGIC:
-                fp = gzip.GzipFile(fileobj=fp, mode=mode)
-            elif HAS_BZ2 and peek_data[:3] == BZ2_MAGIC:
-                fp = bz2.BZ2File(fp, mode=mode)
-            elif HAS_LZ4 and peek_data[:4] == LZ4_MAGIC:
-                fp = lz4.open(fp, mode=mode)
-            elif HAS_ZSTD and peek_data[:4] == ZSTD_MAGIC:
-                dctx = zstd.ZstdDecompressor()
-                fp = dctx.stream_reader(fp)
+            fp = open_stream(fp, mode)
     return fp
-def RecordAdapter(url, out, selector=None, clobber=True, **kwargs):
-    url = str(url or "")
+def RecordAdapter(
+    url: Optional[str] = None,
+    out: bool = False,
+    selector: Optional[str] = None,
+    clobber: bool = True,
+    fileobj: Optional[BinaryIO] = None,
+    **kwargs,
+) -> Union[AbstractWriter, AbstractReader]:
     # Guess adapter based on extension
     ext_to_adapter = {
         ".avro": "avro",
@@ -749,42 +774,94 @@ def RecordAdapter(url, out, selector=None, clobber=True, **kwargs):
         ".jsonl": "jsonfile",
         ".csv": "csvfile",
     }
-    _, ext = os.path.splitext(url)
-    adapter_scheme = ext_to_adapter.get(ext, "stream")
-    if "://" not in url:
-        url = f"{adapter_scheme}://{url}"
-    p = urlparse(url, scheme=adapter_scheme)
-    adapter, _, sub_adapter = p.scheme.partition("+")
+    cls_stream = None
+    cls_url = None
+    adapter = None
+    # When a url is given, we interpret it to determine what kind of adapter we need. This piece of logic is always
+    # necessary for the RecordWriter (as it does not currently support file-like objects), and only needed for
+    # RecordReader if a url is provided.
+    if out is True or url not in ("-", "", None):
+        # Either stdout / stdin is given, or a path-like string.
+        url = str(url or "")
+        _, ext = os.path.splitext(url)
+        adapter_scheme = ext_to_adapter.get(ext, "stream")
+        if "://" not in url:
+            url = f"{adapter_scheme}://{url}"
+        p = urlparse(url, scheme=adapter_scheme)
+        adapter, _, sub_adapter = p.scheme.partition("+")
+        arg_dict = dict(parse_qsl(p.query))
+        arg_dict.update(kwargs)
+        cls_url = p.netloc + p.path
+        if sub_adapter:
+            cls_url = sub_adapter + "://" + cls_url
+    elif url in ("-", ""):
+        # For reading stdin, we cannot rely on an extension to know what sort of stream is incoming. Thus, we will treat
+        # it as a 'fileobj', where we can peek into the stream and try to select the appropriate adapter.
+        fileobj = getattr(sys.stdin, "buffer", sys.stdin)
+    if fileobj is not None:
+        # This record adapter has received a file-like object for record reading
+        # We just need to find the right adapter by peeking into the first few bytes.
+        # First, we open the stream. If the stream is compressed, open_stream will wrap it for us into a decompressor.
+        cls_stream = open_stream(fileobj, "rb")
+        # Now, we have a stream that will be transparently decompressed but we still do not know what adapter to use.
+        # This requires a new peek into the transparent stream. This peek will cause the stream pointer to be moved.
+        # Therefore, find_adapter_for_stream returns both a BinaryIO-supportive object that can correctly read the
+        # adjusted stream, and a string indicating the type of adapter to be used on said stream.
+        arg_dict = kwargs.copy()
+        # If a user did not provide a url, we have to peek into the stream to be able to determine the right adapter
+        # based on magic bytes encountered in the first few bytes of the stream.
+        if adapter is None:
+            cls_stream, adapter = find_adapter_for_stream(cls_stream)
+            if adapter is None:
+                peek_data = cls_stream.peek(RECORDSTREAM_MAGIC_DEPTH)
+                if peek_data and peek_data.startswith(b"<"):
+                    # As peek() can result in a larger buffer than requested, we make sure the peek_data variable isn't
+                    # unnecessarily long in the error message.
+                    peek_data = peek_data[:RECORDSTREAM_MAGIC_DEPTH]
+                    raise RecordAdapterNotFound(
+                        (
+                            f"Could not find a reader for input {peek_data!r}. Are you perhaps "
+                            "entering record text, rather than a record stream? This can be fixed by using "
+                            "'rdump -w -' to write a record stream to stdout."
+                        )
+                    )
+                raise RecordAdapterNotFound("Could not find adapter for file-like object")
+    # Now that we know which adapter is needed, we import it.
     mod = importlib.import_module("flow.record.adapter.{}".format(adapter))
     clsname = ("{}Writer" if out else "{}Reader").format(adapter.title())
     cls = getattr(mod, clsname)
-    arg_dict = dict(parse_qsl(p.query))
-    arg_dict.update(kwargs)
-    cls_url = p.netloc + p.path
-    if sub_adapter:
-        cls_url = sub_adapter + "://" + cls_url
     if not out and selector:
         arg_dict["selector"] = selector
     if out:
         arg_dict["clobber"] = clobber
     log.debug("Creating {!r} for {!r} with args {!r}".format(cls, url, arg_dict))
+    if fileobj is not None:
+        return cls(cls_stream, **arg_dict)
     return cls(cls_url, **arg_dict)
-def RecordReader(url=None, selector=None, **kwargs):
-    return RecordAdapter(url, False, selector=selector, **kwargs)
+def RecordReader(
+    url: Optional[str] = None,
+    selector: Optional[str] = None,
+    fileobj: Optional[BinaryIO] = None,
+    **kwargs,
+) -> AbstractReader:
+    return RecordAdapter(url=url, out=False, selector=selector, fileobj=fileobj, **kwargs)
-def RecordWriter(url=None, clobber=True, **kwargs):
-    return RecordAdapter(url, True, clobber=clobber, **kwargs)
+def RecordWriter(url: Optional[str] = None, clobber: bool = True, **kwargs) -> AbstractWriter:
+    return RecordAdapter(url=url, out=True, clobber=clobber, **kwargs)
 def stream(src, dst):
@@ -834,7 +911,7 @@ def fieldtype(clspath: str) -> FieldType:
 @functools.lru_cache(maxsize=4069)
 def merge_record_descriptors(
-    descriptors: Tuple[RecordDescriptor], replace: bool = False, name: Optional[str] = None
+    descriptors: tuple[RecordDescriptor], replace: bool = False, name: Optional[str] = None
 ) -> RecordDescriptor:
     """Create a newly merged RecordDescriptor from a list of RecordDescriptors.
     This function uses a cache to avoid creating the same descriptor multiple times.
@@ -861,7 +938,7 @@ def merge_record_descriptors(
 def extend_record(
-    record: Record, other_records: List[Record], replace: bool = False, name: Optional[str] = None
+    record: Record, other_records: list[Record], replace: bool = False, name: Optional[str] = None
 ) -> Record:
     """Extend ``record`` with fields and values from ``other_records``.

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/exceptions.py RENAMED Viewed

@@ -4,3 +4,7 @@ class RecordDescriptorError(Exception):
 class RecordDescriptorNotFound(Exception):
     """The specified record descriptor could not be found"""
+class RecordAdapterNotFound(Exception):
+    """Could not find a fitting RecordAdapter for a given input"""

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/__init__.py RENAMED Viewed

@@ -15,9 +15,14 @@ from typing import Any, Optional, Tuple
 from urllib.parse import urlparse
 try:
-    from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+    try:
+        from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+    except ImportError:
+        from backports.zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+    HAS_ZONE_INFO = True
 except ImportError:
-    from backports.zoneinfo import ZoneInfo, ZoneInfoNotFoundError
+    HAS_ZONE_INFO = False
 from flow.record.base import FieldType
@@ -50,9 +55,16 @@ def flow_record_tz(*, default_tz: str = "UTC") -> Optional[ZoneInfo | UTC]:
     Returns:
         None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` or ``UTC`` if ZoneInfo is not found.
     """
     tz = os.environ.get("FLOW_RECORD_TZ", default_tz)
     if tz.upper() == "NONE":
         return None
+    if not HAS_ZONE_INFO:
+        if tz != "UTC":
+            warnings.warn("Cannot use FLOW_RECORD_TZ due to missing zoneinfo module, defaulting to 'UTC'.")
+        return UTC
     try:
         return ZoneInfo(tz)
     except ZoneInfoNotFoundError as exc:

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/stream.py RENAMED Viewed

@@ -8,7 +8,7 @@ import sys
 from collections import ChainMap
 from functools import lru_cache
-from flow.record import RecordWriter
+from flow.record import RECORDSTREAM_MAGIC, RecordWriter
 from flow.record.fieldtypes import fieldtype_for_value
 from flow.record.selector import make_selector
@@ -17,8 +17,6 @@ from .packer import RecordPacker
 log = logging.getLogger(__package__)
-RECORDSTREAM_MAGIC = b"RECORDSTREAM\n"
 def RecordOutput(fp):
     """Return a RecordPrinter if `fp` is a tty otherwise a RecordStreamWriter."""

flow.record-3.13.dev2/flow/record/version.py ADDED Viewed

@@ -0,0 +1,16 @@
+# file generated by setuptools_scm
+# don't change, don't track in version control
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+__version__ = version = '3.13.dev2'
+__version_tuple__ = version_tuple = (3, 13, 'dev2')

{flow.record-3.12.dev5 → flow.record-3.13.dev2/flow.record.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.12.dev5
+Version: 3.13.dev2
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -18,13 +18,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Utilities
 Requires-Python: ~=3.7
 Description-Content-Type: text/markdown
+License-File: LICENSE
+License-File: COPYRIGHT
+Requires-Dist: msgpack>=0.5.2
+Requires-Dist: backports.zoneinfo[tzdata]; python_version < "3.9"
+Requires-Dist: tzdata; platform_system == "Windows"
 Provides-Extra: compression
+Requires-Dist: lz4; extra == "compression"
+Requires-Dist: zstandard; extra == "compression"
 Provides-Extra: elastic
+Requires-Dist: elasticsearch; extra == "elastic"
 Provides-Extra: geoip
+Requires-Dist: maxminddb; extra == "geoip"
 Provides-Extra: avro
+Requires-Dist: fastavro[snappy]; extra == "avro"
 Provides-Extra: test
-License-File: LICENSE
-License-File: COPYRIGHT
+Requires-Dist: lz4; extra == "test"
+Requires-Dist: zstandard; extra == "test"
+Requires-Dist: fastavro; extra == "test"
 # flow.record

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow.record.egg-info/SOURCES.txt RENAMED Viewed

@@ -52,6 +52,7 @@ tests/__init__.py
 tests/_utils.py
 tests/selector_explain_example.py
 tests/standalone_test.py
+tests/test_avro.py
 tests/test_avro_adapter.py
 tests/test_compiled_selector.py
 tests/test_deprecations.py

flow.record-3.13.dev2/tests/test_avro.py ADDED Viewed

@@ -0,0 +1,64 @@
+from io import BytesIO
+import pytest
+from flow.record import RecordDescriptor, RecordReader
+from flow.record.adapter.avro import AvroReader, AvroWriter
+from flow.record.base import HAS_AVRO
+def generate_records(amount):
+    TestRecordWithFooBar = RecordDescriptor(
+        "test/record",
+        [
+            ("string", "name"),
+            ("string", "foo"),
+            ("string", "bar"),
+        ],
+    )
+    for i in range(amount):
+        yield TestRecordWithFooBar(name=f"record{i}", foo="bar", bar="baz")
+def test_writing_reading_avrofile(tmp_path):
+    if not HAS_AVRO:
+        pytest.skip("fastavro module not installed")
+    avro_path = tmp_path / "test.avro"
+    out = AvroWriter(avro_path)
+    for rec in generate_records(100):
+        out.write(rec)
+    out.close()
+    reader = AvroReader(avro_path)
+    for index, rec in enumerate(reader):
+        assert rec.name == f"record{index}"
+        assert rec.foo == "bar"
+        assert rec.bar == "baz"
+def test_avrostream_filelike_object(tmp_path):
+    if not HAS_AVRO:
+        pytest.skip("fastavro module not installed")
+    avro_path = tmp_path / "test.avro"
+    out = AvroWriter(avro_path)
+    for rec in generate_records(100):
+        out.write(rec)
+    out.close()
+    with open(avro_path, "rb") as avro_file:
+        avro_buffer = avro_file.read()
+    avro_io = BytesIO(avro_buffer)
+    reader = RecordReader(fileobj=avro_io)
+    #  The record reader should automatically have created an 'AvroReader' to handle the Avro Record Stream
+    assert isinstance(reader, AvroReader)
+    # Verify that the records read back are the same as the ones written
+    for index, rec in enumerate(reader):
+        assert rec.name == f"record{index}"
+        assert rec.foo == "bar"
+        assert rec.bar == "baz"

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_rdump.py RENAMED Viewed

@@ -67,7 +67,7 @@ def test_rdump_pipe(tmp_path):
     )
     stdout, stderr = p2.communicate()
     assert stdout.strip() == b""
-    assert b"Unknown file format, not a RecordStream" in stderr.strip()
+    assert b"Are you perhaps entering record text, rather than a record stream?" in stderr.strip()
     # rdump test.records -w - | rdump -s 'r.count in (1, 3, 9)' -w filtered.records
     path2 = tmp_path / "filtered.records"
@@ -461,6 +461,43 @@ def test_rdump_headerless_csv(tmp_path, capsysbinary):
     ]
+def test_rdump_stdin_peek(tmp_path):
+    if platform.system() == "Windows":
+        pytest.skip("No Gzip on Windows")
+    TestRecord = RecordDescriptor(
+        "test/record",
+        [
+            ("varint", "count"),
+            ("string", "foo"),
+        ],
+    )
+    path = tmp_path / "test.records"
+    writer = RecordWriter(path)
+    # generate some test records
+    for i in range(10):
+        writer.write(TestRecord(count=i, foo="bar"))
+    writer.close()
+    # Gzip compress records file
+    compress_cmd = ["gzip", "--keep", str(path)]
+    subprocess.check_output(compress_cmd)
+    compressed_path = str(path) + ".gz"
+    # Rdump should transparently decompress and select the correct adapter
+    p1 = subprocess.Popen(["cat", compressed_path], stdout=subprocess.PIPE)
+    p2 = subprocess.Popen(
+        ["rdump", "-s", "r.count == 5"],
+        stdin=p1.stdout,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    stdout, _ = p2.communicate()
+    assert stdout.strip() in (b"<test/record count=5 foo='bar'>", b"<test/record count=5L foo=u'bar'>")
 @pytest.mark.parametrize(
     ("total_records", "count", "skip", "expected_numbers"),
     [

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_record_adapter.py RENAMED Viewed

@@ -18,6 +18,7 @@ from flow.record import (
     RecordStreamReader,
     RecordWriter,
 )
+from flow.record.adapter.stream import StreamReader
 from flow.record.base import (
     BZ2_MAGIC,
     GZIP_MAGIC,
@@ -46,6 +47,26 @@ def test_stream_writer_reader():
     assert set([2, 7]) == set([r.number for r in records])
+def test_recordstream_filelike_object():
+    fp = StringIO()
+    out = RecordOutput(fp)
+    for rec in generate_records():
+        out.write(rec)
+    fp.seek(0)
+    reader = RecordReader(fileobj=fp, selector="r.number in (6, 9)")
+    #  The record reader should automatically have created a 'StreamReader' to handle the Record Stream.
+    assert isinstance(reader, StreamReader)
+    # Verify if selector worked and records are the same
+    records = []
+    for rec in reader:
+        records.append(rec)
+    assert set([6, 9]) == set([r.number for r in records])
 @pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
 def test_file_writer_reader(tmpdir, PSelector):
     p = tmpdir.join("test.records")
@@ -104,6 +125,15 @@ def test_compressed_writer_reader(tmpdir, compression):
     assert numbers == list(range(count))
+    # Using a file-handle instead of a path should also work
+    with open(path, "rb") as fh:
+        reader = RecordReader(fileobj=fh)
+        numbers = []
+        for rec in reader:
+            numbers.append(rec.number)
+        assert numbers == list(range(count))
 def test_path_template_writer(tmpdir):
     TestRecord = RecordDescriptor(

{flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_regression.py RENAMED Viewed

@@ -5,7 +5,8 @@ import pathlib
 import subprocess
 import sys
 from datetime import datetime, timezone
-from unittest.mock import mock_open, patch
+from io import BytesIO
+from unittest.mock import MagicMock, mock_open, patch
 import msgpack
 import pytest
@@ -589,7 +590,11 @@ def test_record_adapter_windows_path(tmp_path):
         writer.write(TestRecord("foo"))
         writer.write(TestRecord("bar"))
-    with patch("io.open", mock_open(read_data=path_records.read_bytes())) as m:
+    test_read_buf = BytesIO(path_records.read_bytes())
+    mock_reader = MagicMock(wraps=test_read_buf, spec=BytesIO)
+    with patch("io.open", MagicMock(return_value=mock_reader)) as m:
+        m.return_value.closed = False
         adapter = RecordReader(r"c:\users\user\test.records")
         assert type(adapter).__name__ == "StreamReader"
         m.assert_called_once_with(r"c:\users\user\test.records", "rb")

flow.record-3.12.dev5/flow/record/version.py DELETED Viewed

@@ -1,4 +0,0 @@
-# file generated by setuptools_scm
-# don't change, don't track in version control
-__version__ = version = '3.12.dev5'
-__version_tuple__ = version_tuple = (3, 12, 'dev5')