PyPI - flow.record - Versions diffs - 3.17.dev4__tar.gz → 3.17.dev5__tar.gz - Mend

flow.record 3.17.dev4.tar.gz → 3.17.dev5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)

{flow_record-3.17.dev4/flow.record.egg-info → flow_record-3.17.dev5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.17.dev4
+Version: 3.17.dev5
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/adapter/elastic.py RENAMED Viewed

@@ -106,7 +106,7 @@ class ElasticWriter(AbstractWriter):
         }
         if self.hash_record:
-            document["_id"] = hashlib.md5(document["_source"].encode()).hexdigest()
+            document["_id"] = hashlib.md5(document["_source"].encode(errors="surrogateescape")).hexdigest()
         return document

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/adapter/line.py RENAMED Viewed

@@ -69,7 +69,7 @@ class LineWriter(AbstractWriter):
         for key, value in rdict.items():
             if rdict_types:
                 key = f"{key} ({rdict_types[key]})"
-            self.fp.write(fmt.format(key, value).encode())
+            self.fp.write(fmt.format(key, value).encode(errors="surrogateescape"))
     def flush(self) -> None:
         if self.fp:

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/adapter/sqlite.py RENAMED Viewed

@@ -187,7 +187,7 @@ class SqliteReader(AbstractReader):
                         if value == 0:
                             row[idx] = None
                         elif isinstance(value, str):
-                            row[idx] = value.encode("utf-8")
+                            row[idx] = value.encode(errors="surrogateescape")
                 yield descriptor_cls.init_from_dict(dict(zip(fnames, row)))
     def __iter__(self) -> Iterator[Record]:

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/adapter/text.py RENAMED Viewed

@@ -41,7 +41,7 @@ class TextWriter(AbstractWriter):
             buf = self.format_spec.format_map(DefaultMissing(rec._asdict()))
         else:
             buf = repr(rec)
-        self.fp.write(buf.encode() + b"\n")
+        self.fp.write(buf.encode(errors="surrogateescape") + b"\n")
         # because stdout is usually line buffered we force flush here if wanted
         if self.auto_flush:

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/adapter/xlsx.py RENAMED Viewed

@@ -36,7 +36,7 @@ def sanitize_fieldvalues(values: Iterator[Any]) -> Iterator[Any]:
         elif isinstance(value, bytes):
             base64_encode = False
             try:
-                new_value = 'b"' + value.decode() + '"'
+                new_value = 'b"' + value.decode(errors="surrogateescape") + '"'
                 if ILLEGAL_CHARACTERS_RE.search(new_value):
                     base64_encode = True
                 else:
@@ -142,7 +142,7 @@ class XlsxReader(AbstractReader):
                     if field_types[idx] == "bytes":
                         if value[1] == '"':  # If so, we know this is b""
                             # Cut of the b" at the start and the trailing "
-                            value = value[2:-1].encode()
+                            value = value[2:-1].encode(errors="surrogateescape")
                         else:
                             # If not, we know it is base64 encoded (so we cut of the starting 'base64:')
                             value = b64decode(value[7:])

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/base.py RENAMED Viewed

@@ -61,7 +61,7 @@ except ImportError:
 from collections import OrderedDict
-from .utils import to_native_str, to_str
+from .utils import to_str
 from .whitelist import WHITELIST, WHITELIST_TREE
 log = logging.getLogger(__package__)
@@ -513,7 +513,7 @@ class RecordDescriptor:
             name, fields = parse_def(name)
         self.name = name
-        self._field_tuples = tuple([(to_native_str(k), to_str(v)) for k, v in fields])
+        self._field_tuples = tuple([(to_str(k), to_str(v)) for k, v in fields])
         self.recordType = _generate_record_class(name, self._field_tuples)
         self.recordType._desc = self

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/fieldtypes/__init__.py RENAMED Viewed

@@ -28,7 +28,6 @@ except ImportError:
 from flow.record.base import FieldType
 RE_NORMALIZE_PATH = re.compile(r"[\\/]+")
-NATIVE_UNICODE = isinstance("", str)
 UTC = timezone.utc
@@ -207,10 +206,7 @@ class stringlist(list, FieldType):
 class string(string_type, FieldType):
     def __new__(cls, value):
         if isinstance(value, bytes_type):
-            value = cls._decode(value, "utf-8")
-            if isinstance(value, bytes_type):
-                # Still bytes, so decoding failed (Python 2)
-                return bytes(value)
+            value = value.decode(errors="surrogateescape")
         return super().__new__(cls, value)
     def _pack(self):
@@ -221,27 +217,6 @@ class string(string_type, FieldType):
             return defang(self)
         return str.__format__(self, spec)
-    @classmethod
-    def _decode(cls, data, encoding):
-        """Decode a byte-string into a unicode-string.
-        Python 3: When `data` contains invalid unicode characters a `UnicodeDecodeError` is raised.
-        Python 2: When `data` contains invalid unicode characters the original byte-string is returned.
-        """
-        if NATIVE_UNICODE:
-            # Raises exception on decode error
-            return data.decode(encoding)
-        try:
-            return data.decode(encoding)
-        except UnicodeDecodeError:
-            # Fallback to bytes (Python 2 only)
-            preview = data[:16].encode("hex_codec") + (".." if len(data) > 16 else "")
-            warnings.warn(
-                "Got binary data in string field (hex: {}). Compatibility is not guaranteed.".format(preview),
-                RuntimeWarning,
-            )
-            return data
 # Alias for backwards compatibility
 wstring = string
@@ -278,7 +253,7 @@ class datetime(_dt, FieldType):
         if len(args) == 1 and not kwargs:
             arg = args[0]
             if isinstance(arg, bytes_type):
-                arg = arg.decode("utf-8")
+                arg = arg.decode(errors="surrogateescape")
             if isinstance(arg, string_type):
                 # If we are on Python 3.11 or newer, we can use fromisoformat() to parse the string (fast path)
                 #

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/fieldtypes/net/ipv4.py RENAMED Viewed

@@ -3,7 +3,6 @@ import struct
 import warnings
 from flow.record import FieldType
-from flow.record.utils import to_native_str
 def addr_long(s):
@@ -45,9 +44,6 @@ class subnet(FieldType):
             DeprecationWarning,
             stacklevel=5,
         )
-        if isinstance(addr, type("")):
-            addr = to_native_str(addr)
         if not isinstance(addr, str):
             raise TypeError("Subnet() argument 1 must be string, not {}".format(type(addr).__name__))
@@ -67,9 +63,6 @@ class subnet(FieldType):
         if addr is None:
             return False
-        if isinstance(addr, type("")):
-            addr = to_native_str(addr)
         if isinstance(addr, str):
             addr = addr_long(addr)

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/jsonpacker.py RENAMED Viewed

@@ -47,12 +47,8 @@ class JsonRecordPacker:
                 serial["_recorddescriptor"] = obj._desc.identifier
             for field_type, field_name in obj._desc.get_field_tuples():
-                # PYTHON2: Because "bytes" are also "str" we have to handle this here
-                if field_type == "bytes" and isinstance(serial[field_name], str):
-                    serial[field_name] = base64.b64encode(serial[field_name]).decode()
                 # Boolean field types should be cast to a bool instead of staying ints
-                elif field_type == "boolean" and isinstance(serial[field_name], int):
+                if field_type == "boolean" and isinstance(serial[field_name], int):
                     serial[field_name] = bool(serial[field_name])
             return serial

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/utils.py RENAMED Viewed

@@ -3,13 +3,10 @@ from __future__ import annotations
 import base64
 import os
 import sys
+import warnings
 from functools import wraps
 from typing import BinaryIO, TextIO
-_native = str
-_unicode = type("")
-_bytes = type(b"")
 def get_stdout(binary: bool = False) -> TextIO | BinaryIO:
     """Return the stdout stream as binary or text stream.
@@ -50,33 +47,32 @@ def is_stdout(fp: TextIO | BinaryIO) -> bool:
 def to_bytes(value):
     """Convert a value to a byte string."""
-    if value is None or isinstance(value, _bytes):
+    if value is None or isinstance(value, bytes):
         return value
-    if isinstance(value, _unicode):
-        return value.encode("utf-8")
-    return _bytes(value)
+    if isinstance(value, str):
+        return value.encode(errors="surrogateescape")
+    return bytes(value)
 def to_str(value):
     """Convert a value to a unicode string."""
-    if value is None or isinstance(value, _unicode):
+    if value is None or isinstance(value, str):
         return value
-    if isinstance(value, _bytes):
-        return value.decode("utf-8")
-    return _unicode(value)
+    if isinstance(value, bytes):
+        return value.decode(errors="surrogateescape")
+    return str(value)
 def to_native_str(value):
-    """Convert a value to a native `str`."""
-    if value is None or isinstance(value, _native):
-        return value
-    if isinstance(value, _unicode):
-        # Python 2: unicode -> str
-        return value.encode("utf-8")
-    if isinstance(value, _bytes):
-        # Python 3: bytes -> str
-        return value.decode("utf-8")
-    return _native(value)
+    warnings.warn(
+        (
+            "The to_native_str() function is deprecated, "
+            "this function will be removed in flow.record 3.20, "
+            "use to_str() instead"
+        ),
+        DeprecationWarning,
+    )
+    return to_str(value)
 def to_base64(value):

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow/record/version.py RENAMED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '3.17.dev4'
-__version_tuple__ = version_tuple = (3, 17, 'dev4')
+__version__ = version = '3.17.dev5'
+__version_tuple__ = version_tuple = (3, 17, 'dev5')

{flow_record-3.17.dev4 → flow_record-3.17.dev5/flow.record.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.17.dev4
+Version: 3.17.dev5
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/flow.record.egg-info/SOURCES.txt RENAMED Viewed

@@ -54,6 +54,8 @@ tests/__init__.py
 tests/_utils.py
 tests/selector_explain_example.py
 tests/standalone_test.py
+tests/test_adapter_line.py
+tests/test_adapter_text.py
 tests/test_avro.py
 tests/test_avro_adapter.py
 tests/test_compiled_selector.py
@@ -75,7 +77,6 @@ tests/test_selector.py
 tests/test_splunk_adapter.py
 tests/test_sqlite_duckdb_adapter.py
 tests/test_xlsx_adapter.py
-tests/utils_inspect.py
 tests/docs/Makefile
 tests/docs/conf.py
 tests/docs/index.rst

flow_record-3.17.dev5/tests/test_adapter_line.py ADDED Viewed

@@ -0,0 +1,29 @@
+from io import BytesIO
+from flow.record import RecordDescriptor
+from flow.record.adapter.line import LineWriter
+def test_line_writer_write_surrogateescape():
+    output = BytesIO()
+    lw = LineWriter(
+        path=output,
+        fields="name",
+    )
+    TestRecord = RecordDescriptor(
+        "test/string",
+        [
+            ("string", "name"),
+        ],
+    )
+    # construct from 'bytes' but with invalid unicode bytes
+    record = TestRecord(b"R\xc3\xa9\xeamy")
+    lw.write(record)
+    output.seek(0)
+    data = output.read()
+    assert data == b"--[ RECORD 1 ]--\nname = R\xc3\xa9\xeamy\n"

flow_record-3.17.dev5/tests/test_adapter_text.py ADDED Viewed

@@ -0,0 +1,28 @@
+from io import BytesIO
+from flow.record import RecordDescriptor
+from flow.record.adapter.text import TextWriter
+def test_text_writer_write_surrogateescape():
+    output = BytesIO()
+    tw = TextWriter(
+        path=output,
+    )
+    TestRecord = RecordDescriptor(
+        "test/string",
+        [
+            ("string", "name"),
+        ],
+    )
+    # construct from 'bytes' but with invalid unicode bytes
+    record = TestRecord(b"R\xc3\xa9\xeamy")
+    tw.write(record)
+    output.seek(0)
+    data = output.read()
+    assert data == b"<test/string name='R\xc3\xa9\\udceamy'>\n"

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/tests/test_fieldtypes.py RENAMED Viewed

@@ -213,15 +213,8 @@ def test_string():
     assert r.name == "Rémy"
     # construct from 'bytes' but with invalid unicode bytes
-    if isinstance("", str):
-        # Python 3
-        with pytest.raises(UnicodeDecodeError):
-            TestRecord(b"R\xc3\xa9\xeamy")
-    else:
-        # Python 2
-        with pytest.warns(RuntimeWarning):
-            r = TestRecord(b"R\xc3\xa9\xeamy")
-            assert r.name
+    r = TestRecord(b"R\xc3\xa9\xeamy")
+    assert r.name == "Ré\udceamy"
 def test_wstring():

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/tests/test_json_packer.py RENAMED Viewed

@@ -90,3 +90,23 @@ def test_record_pack_bool_regression() -> None:
     # pack the json string back to a record and make sure it is the same as before
     assert packer.unpack(data) == record
+def test_record_pack_surrogateescape() -> None:
+    TestRecord = RecordDescriptor(
+        "test/string",
+        [
+            ("string", "name"),
+        ],
+    )
+    record = TestRecord(b"R\xc3\xa9\xeamy")
+    packer = JsonRecordPacker()
+    data = packer.pack(record)
+    # pack to json string and check if the 3rd and 4th byte are properly surrogate escaped
+    assert data.startswith('{"name": "R\\u00e9\\udceamy",')
+    # pack the json string back to a record and make sure it is the same as before
+    assert packer.unpack(data) == record

{flow_record-3.17.dev4 → flow_record-3.17.dev5}/tests/test_record.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import importlib
+import inspect
 import os
 import sys
 from unittest.mock import patch
@@ -27,8 +28,6 @@ from flow.record.base import (
 from flow.record.exceptions import RecordDescriptorError
 from flow.record.stream import RecordFieldRewriter
-from . import utils_inspect as inspect
 def test_record_creation():
     TestRecord = RecordDescriptor(
@@ -288,8 +287,30 @@ def test_record_printer_stdout(capsys):
     writer.write(record)
     out, err = capsys.readouterr()
-    modifier = "" if isinstance("", str) else "u"
-    expected = "<test/a a_string={u}'hello' common={u}'world' a_count=10>\n".format(u=modifier)
+    expected = "<test/a a_string='hello' common='world' a_count=10>\n"
+    assert out == expected
+def test_record_printer_stdout_surrogateescape(capsys):
+    Record = RecordDescriptor(
+        "test/a",
+        [
+            ("string", "name"),
+        ],
+    )
+    record = Record(b"R\xc3\xa9\xeamy")
+    # fake capsys to be a tty.
+    def isatty():
+        return True
+    capsys._capture.out.tmpfile.isatty = isatty
+    writer = RecordPrinter(getattr(sys.stdout, "buffer", sys.stdout))
+    writer.write(record)
+    out, err = capsys.readouterr()
+    expected = "<test/a name='Ré\\udceamy'>\n"
     assert out == expected

flow_record-3.17.dev4/tests/utils_inspect.py DELETED Viewed

@@ -1,58 +0,0 @@
-"""
-Backport of `inspect.signature` for Python 2.
-Based on: https://github.com/python/cpython/blob/3.7/Lib/inspect.py
-"""
-import collections
-import inspect
-class _empty:
-    pass
-class Parameter:
-    POSITIONAL_ONLY = 0
-    POSITIONAL_OR_KEYWORD = 1
-    VAR_POSITIONAL = 2
-    KEYWORD_ONLY = 3
-    VAR_KEYWORD = 4
-    empty = _empty
-    def __init__(self, name, kind, default=_empty):
-        self.name = name
-        self.kind = kind
-        self.default = default
-class Signature:
-    empty = _empty
-    def __init__(self, parameters=None):
-        self.parameters = parameters
-def signature(obj):
-    try:
-        # Python 3
-        return inspect.signature(obj)
-    except AttributeError:
-        # Python 2
-        spec = inspect.getargspec(obj)
-        # Create parameter objects which are compatible with python 3 objects
-        parameters = collections.OrderedDict()
-        for i in range(0, len(spec.args)):
-            arg = spec.args[i]
-            default = _empty
-            if spec.defaults and (len(spec.args) - i <= len(spec.defaults)):
-                default = spec.defaults[i - len(spec.args)]
-            parameters[arg] = Parameter(name=arg, default=default, kind=Parameter.POSITIONAL_OR_KEYWORD)
-        if spec.varargs:
-            parameters[spec.varargs] = Parameter(name=spec.varargs, kind=Parameter.VAR_POSITIONAL)
-        if spec.keywords:
-            parameters[spec.keywords] = Parameter(name=spec.keywords, kind=Parameter.VAR_KEYWORD)
-        return Signature(parameters=parameters)