flow.record 3.12.dev5__tar.gz → 3.13.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flow.record-3.12.dev5/flow.record.egg-info → flow.record-3.13.dev2}/PKG-INFO +14 -3
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/__init__.py +6 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/avro.py +12 -9
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/stream.py +16 -13
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/base.py +163 -86
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/exceptions.py +4 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/__init__.py +14 -2
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/stream.py +1 -3
- flow.record-3.13.dev2/flow/record/version.py +16 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2/flow.record.egg-info}/PKG-INFO +14 -3
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow.record.egg-info/SOURCES.txt +1 -0
- flow.record-3.13.dev2/tests/test_avro.py +64 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_rdump.py +38 -1
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_record_adapter.py +30 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_regression.py +7 -2
- flow.record-3.12.dev5/flow/record/version.py +0 -4
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/COPYRIGHT +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/LICENSE +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/MANIFEST.in +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/README.md +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/examples/filesystem.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/examples/passivedns.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/examples/records.json +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/examples/tcpconn.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/__init__.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/archive.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/broker.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/csvfile.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/elastic.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/jsonfile.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/line.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/mongo.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/split.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/splunk.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/text.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/adapter/xlsx.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/credential.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/net/__init__.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/net/ip.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/net/ipv4.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/net/tcp.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/fieldtypes/net/udp.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/jsonpacker.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/packer.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/selector.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/tools/__init__.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/tools/geoip.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/tools/rdump.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/utils.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow/record/whitelist.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow.record.egg-info/dependency_links.txt +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow.record.egg-info/entry_points.txt +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow.record.egg-info/requires.txt +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/flow.record.egg-info/top_level.txt +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/pyproject.toml +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/setup.cfg +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/__init__.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/_utils.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/docs/Makefile +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/docs/conf.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/docs/index.rst +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/selector_explain_example.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/standalone_test.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_avro_adapter.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_compiled_selector.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_deprecations.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_fieldtype_ip.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_fieldtypes.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_json_packer.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_json_record_adapter.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_multi_timestamp.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_packer.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_record.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_record_descriptor.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_selector.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/test_splunk_adapter.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tests/utils_inspect.py +0 -0
- {flow.record-3.12.dev5 → flow.record-3.13.dev2}/tox.ini +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flow.record
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.13.dev2
|
|
4
4
|
Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
|
|
5
5
|
Author-email: Dissect Team <dissect@fox-it.com>
|
|
6
6
|
License: Affero General Public License v3
|
|
@@ -18,13 +18,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
18
18
|
Classifier: Topic :: Utilities
|
|
19
19
|
Requires-Python: ~=3.7
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
License-File: COPYRIGHT
|
|
23
|
+
Requires-Dist: msgpack>=0.5.2
|
|
24
|
+
Requires-Dist: backports.zoneinfo[tzdata]; python_version < "3.9"
|
|
25
|
+
Requires-Dist: tzdata; platform_system == "Windows"
|
|
21
26
|
Provides-Extra: compression
|
|
27
|
+
Requires-Dist: lz4; extra == "compression"
|
|
28
|
+
Requires-Dist: zstandard; extra == "compression"
|
|
22
29
|
Provides-Extra: elastic
|
|
30
|
+
Requires-Dist: elasticsearch; extra == "elastic"
|
|
23
31
|
Provides-Extra: geoip
|
|
32
|
+
Requires-Dist: maxminddb; extra == "geoip"
|
|
24
33
|
Provides-Extra: avro
|
|
34
|
+
Requires-Dist: fastavro[snappy]; extra == "avro"
|
|
25
35
|
Provides-Extra: test
|
|
26
|
-
|
|
27
|
-
|
|
36
|
+
Requires-Dist: lz4; extra == "test"
|
|
37
|
+
Requires-Dist: zstandard; extra == "test"
|
|
38
|
+
Requires-Dist: fastavro; extra == "test"
|
|
28
39
|
|
|
29
40
|
# flow.record
|
|
30
41
|
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
|
|
4
4
|
from flow.record.base import (
|
|
5
5
|
RECORD_VERSION,
|
|
6
|
+
RECORDSTREAM_MAGIC,
|
|
6
7
|
DynamicDescriptor,
|
|
7
8
|
FieldType,
|
|
8
9
|
GroupedRecord,
|
|
@@ -16,7 +17,9 @@ from flow.record.base import (
|
|
|
16
17
|
dynamic_fieldtype,
|
|
17
18
|
extend_record,
|
|
18
19
|
iter_timestamped_records,
|
|
20
|
+
open_file,
|
|
19
21
|
open_path,
|
|
22
|
+
open_stream,
|
|
20
23
|
stream,
|
|
21
24
|
)
|
|
22
25
|
from flow.record.jsonpacker import JsonRecordPacker
|
|
@@ -33,6 +36,7 @@ from flow.record.stream import (
|
|
|
33
36
|
|
|
34
37
|
__all__ = [
|
|
35
38
|
"RECORD_VERSION",
|
|
39
|
+
"RECORDSTREAM_MAGIC",
|
|
36
40
|
"FieldType",
|
|
37
41
|
"Record",
|
|
38
42
|
"GroupedRecord",
|
|
@@ -47,7 +51,9 @@ __all__ = [
|
|
|
47
51
|
"JsonRecordPacker",
|
|
48
52
|
"RecordStreamWriter",
|
|
49
53
|
"RecordStreamReader",
|
|
54
|
+
"open_file",
|
|
50
55
|
"open_path",
|
|
56
|
+
"open_stream",
|
|
51
57
|
"stream",
|
|
52
58
|
"dynamic_fieldtype",
|
|
53
59
|
"DynamicDescriptor",
|
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
from datetime import datetime, timedelta, timezone
|
|
3
5
|
from importlib.util import find_spec
|
|
6
|
+
from typing import Any, Iterator
|
|
4
7
|
|
|
5
8
|
import fastavro
|
|
6
9
|
|
|
@@ -50,7 +53,7 @@ class AvroWriter(AbstractWriter):
|
|
|
50
53
|
writer = None
|
|
51
54
|
|
|
52
55
|
def __init__(self, path, key=None, **kwargs):
|
|
53
|
-
self.fp = record.
|
|
56
|
+
self.fp = record.open_file(path, "wb")
|
|
54
57
|
|
|
55
58
|
self.desc = None
|
|
56
59
|
self.schema = None
|
|
@@ -58,7 +61,7 @@ class AvroWriter(AbstractWriter):
|
|
|
58
61
|
self.writer = None
|
|
59
62
|
self.codec = "snappy" if find_spec("snappy") else "deflate"
|
|
60
63
|
|
|
61
|
-
def write(self, r):
|
|
64
|
+
def write(self, r: record.Record) -> None:
|
|
62
65
|
if not self.desc:
|
|
63
66
|
self.desc = r._desc
|
|
64
67
|
self.schema = descriptor_to_schema(self.desc)
|
|
@@ -79,7 +82,7 @@ class AvroWriter(AbstractWriter):
|
|
|
79
82
|
)
|
|
80
83
|
self.writer.flush()
|
|
81
84
|
|
|
82
|
-
def close(self):
|
|
85
|
+
def close(self) -> None:
|
|
83
86
|
if self.fp and not is_stdout(self.fp):
|
|
84
87
|
self.fp.close()
|
|
85
88
|
self.fp = None
|
|
@@ -90,7 +93,7 @@ class AvroReader(AbstractReader):
|
|
|
90
93
|
fp = None
|
|
91
94
|
|
|
92
95
|
def __init__(self, path, selector=None, **kwargs):
|
|
93
|
-
self.fp = record.
|
|
96
|
+
self.fp = record.open_file(path, "rb")
|
|
94
97
|
self.selector = make_selector(selector)
|
|
95
98
|
|
|
96
99
|
self.reader = fastavro.reader(self.fp)
|
|
@@ -105,7 +108,7 @@ class AvroReader(AbstractReader):
|
|
|
105
108
|
name for name, field in self.desc.get_all_fields().items() if field.typename == "datetime"
|
|
106
109
|
)
|
|
107
110
|
|
|
108
|
-
def __iter__(self):
|
|
111
|
+
def __iter__(self) -> Iterator[record.Record]:
|
|
109
112
|
for obj in self.reader:
|
|
110
113
|
# Convert timestamp-micros fields back to datetime fields
|
|
111
114
|
for field_name in self.datetime_fields:
|
|
@@ -117,13 +120,13 @@ class AvroReader(AbstractReader):
|
|
|
117
120
|
if not self.selector or self.selector.match(rec):
|
|
118
121
|
yield rec
|
|
119
122
|
|
|
120
|
-
def close(self):
|
|
123
|
+
def close(self) -> None:
|
|
121
124
|
if self.fp:
|
|
122
125
|
self.fp.close()
|
|
123
126
|
self.fp = None
|
|
124
127
|
|
|
125
128
|
|
|
126
|
-
def descriptor_to_schema(desc):
|
|
129
|
+
def descriptor_to_schema(desc: record.RecordDescriptor) -> dict[str, Any]:
|
|
127
130
|
namespace, _, name = desc.name.rpartition("/")
|
|
128
131
|
schema = {
|
|
129
132
|
"type": "record",
|
|
@@ -156,7 +159,7 @@ def descriptor_to_schema(desc):
|
|
|
156
159
|
return schema
|
|
157
160
|
|
|
158
161
|
|
|
159
|
-
def schema_to_descriptor(schema):
|
|
162
|
+
def schema_to_descriptor(schema: dict) -> record.RecordDescriptor:
|
|
160
163
|
doc = schema.get("doc")
|
|
161
164
|
|
|
162
165
|
# Sketchy record descriptor detection
|
|
@@ -178,7 +181,7 @@ def schema_to_descriptor(schema):
|
|
|
178
181
|
return record.RecordDescriptor(name, fields)
|
|
179
182
|
|
|
180
183
|
|
|
181
|
-
def avro_type_to_flow_type(ftype):
|
|
184
|
+
def avro_type_to_flow_type(ftype: list) -> str:
|
|
182
185
|
ftypes = [ftype] if not isinstance(ftype, list) else ftype
|
|
183
186
|
|
|
184
187
|
# If a field can be null, it has an additional type of "null"
|
|
@@ -1,5 +1,8 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import Iterator, Union
|
|
2
|
+
|
|
3
|
+
from flow.record import Record, RecordOutput, RecordStreamReader, open_file, open_path
|
|
2
4
|
from flow.record.adapter import AbstractReader, AbstractWriter
|
|
5
|
+
from flow.record.selector import Selector
|
|
3
6
|
from flow.record.utils import is_stdout
|
|
4
7
|
|
|
5
8
|
__usage__ = """
|
|
@@ -15,20 +18,20 @@ class StreamWriter(AbstractWriter):
|
|
|
15
18
|
fp = None
|
|
16
19
|
stream = None
|
|
17
20
|
|
|
18
|
-
def __init__(self, path, clobber=True, **kwargs):
|
|
19
|
-
self.fp =
|
|
20
|
-
self.stream =
|
|
21
|
+
def __init__(self, path: str, clobber=True, **kwargs):
|
|
22
|
+
self.fp = open_path(path, "wb", clobber=clobber)
|
|
23
|
+
self.stream = RecordOutput(self.fp)
|
|
21
24
|
|
|
22
|
-
def write(self,
|
|
23
|
-
self.stream.write(
|
|
25
|
+
def write(self, record: Record) -> None:
|
|
26
|
+
self.stream.write(record)
|
|
24
27
|
|
|
25
|
-
def flush(self):
|
|
28
|
+
def flush(self) -> None:
|
|
26
29
|
if self.stream and hasattr(self.stream, "flush"):
|
|
27
30
|
self.stream.flush()
|
|
28
31
|
if self.fp:
|
|
29
32
|
self.fp.flush()
|
|
30
33
|
|
|
31
|
-
def close(self):
|
|
34
|
+
def close(self) -> None:
|
|
32
35
|
if self.stream:
|
|
33
36
|
self.stream.close()
|
|
34
37
|
self.stream = None
|
|
@@ -42,14 +45,14 @@ class StreamReader(AbstractReader):
|
|
|
42
45
|
fp = None
|
|
43
46
|
stream = None
|
|
44
47
|
|
|
45
|
-
def __init__(self, path, selector=None, **kwargs):
|
|
46
|
-
self.fp =
|
|
47
|
-
self.stream =
|
|
48
|
+
def __init__(self, path: str, selector: Union[str, Selector] = None, **kwargs):
|
|
49
|
+
self.fp = open_file(path, "rb")
|
|
50
|
+
self.stream = RecordStreamReader(self.fp, selector=selector)
|
|
48
51
|
|
|
49
|
-
def __iter__(self):
|
|
52
|
+
def __iter__(self) -> Iterator[Record]:
|
|
50
53
|
return iter(self.stream)
|
|
51
54
|
|
|
52
|
-
def close(self):
|
|
55
|
+
def close(self) -> None:
|
|
53
56
|
if self.stream:
|
|
54
57
|
self.stream.close()
|
|
55
58
|
self.stream = None
|
|
@@ -14,10 +14,12 @@ import sys
|
|
|
14
14
|
import warnings
|
|
15
15
|
from datetime import datetime, timezone
|
|
16
16
|
from itertools import zip_longest
|
|
17
|
-
from
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import IO, Any, BinaryIO, Iterator, Mapping, Optional, Sequence, Union
|
|
18
19
|
from urllib.parse import parse_qsl, urlparse
|
|
19
20
|
|
|
20
|
-
from .
|
|
21
|
+
from flow.record.adapter import AbstractReader, AbstractWriter
|
|
22
|
+
from flow.record.exceptions import RecordAdapterNotFound, RecordDescriptorError
|
|
21
23
|
|
|
22
24
|
try:
|
|
23
25
|
import lz4.frame as lz4
|
|
@@ -38,6 +40,13 @@ try:
|
|
|
38
40
|
except ImportError:
|
|
39
41
|
HAS_ZSTD = False
|
|
40
42
|
|
|
43
|
+
try:
|
|
44
|
+
import fastavro as avro # noqa
|
|
45
|
+
|
|
46
|
+
HAS_AVRO = True
|
|
47
|
+
except ImportError:
|
|
48
|
+
HAS_AVRO = False
|
|
49
|
+
|
|
41
50
|
from collections import OrderedDict
|
|
42
51
|
|
|
43
52
|
from .utils import to_native_str, to_str
|
|
@@ -63,6 +72,10 @@ GZIP_MAGIC = b"\x1f\x8b"
|
|
|
63
72
|
BZ2_MAGIC = b"BZh"
|
|
64
73
|
LZ4_MAGIC = b"\x04\x22\x4d\x18"
|
|
65
74
|
ZSTD_MAGIC = b"\x28\xb5\x2f\xfd"
|
|
75
|
+
AVRO_MAGIC = b"Obj"
|
|
76
|
+
|
|
77
|
+
RECORDSTREAM_MAGIC = b"RECORDSTREAM\n"
|
|
78
|
+
RECORDSTREAM_MAGIC_DEPTH = 4 + 2 + len(RECORDSTREAM_MAGIC)
|
|
66
79
|
|
|
67
80
|
RE_VALID_FIELD_NAME = re.compile(r"^_?[a-zA-Z][a-zA-Z0-9_]*(?:\[\])?$")
|
|
68
81
|
RE_VALID_RECORD_TYPE_NAME = re.compile("^[a-zA-Z][a-zA-Z0-9_]*(/[a-zA-Z][a-zA-Z0-9_]*)*$")
|
|
@@ -83,37 +96,6 @@ class {name}(Record):
|
|
|
83
96
|
"""
|
|
84
97
|
|
|
85
98
|
|
|
86
|
-
class Peekable:
|
|
87
|
-
"""Wrapper class for adding .peek() to a file object."""
|
|
88
|
-
|
|
89
|
-
def __init__(self, fd):
|
|
90
|
-
self.fd = fd
|
|
91
|
-
self.buffer = None
|
|
92
|
-
|
|
93
|
-
def peek(self, size):
|
|
94
|
-
if self.buffer is not None:
|
|
95
|
-
raise BufferError("Only 1 peek allowed")
|
|
96
|
-
data = self.fd.read(size)
|
|
97
|
-
self.buffer = io.BytesIO(data)
|
|
98
|
-
return data
|
|
99
|
-
|
|
100
|
-
def read(self, size=None):
|
|
101
|
-
data = b""
|
|
102
|
-
if self.buffer is None:
|
|
103
|
-
data = self.fd.read(size)
|
|
104
|
-
else:
|
|
105
|
-
data = self.buffer.read(size)
|
|
106
|
-
if len(data) < size:
|
|
107
|
-
data += self.fd.read(size - len(data))
|
|
108
|
-
self.buffer = None
|
|
109
|
-
return data
|
|
110
|
-
|
|
111
|
-
def close(self):
|
|
112
|
-
self.buffer = None
|
|
113
|
-
self.fd.close()
|
|
114
|
-
self.fd = None
|
|
115
|
-
|
|
116
|
-
|
|
117
99
|
class FieldType:
|
|
118
100
|
def _typename(self):
|
|
119
101
|
t = type(self)
|
|
@@ -339,7 +321,7 @@ class RecordFieldSet(list):
|
|
|
339
321
|
|
|
340
322
|
|
|
341
323
|
@functools.lru_cache(maxsize=4096)
|
|
342
|
-
def _generate_record_class(name: str, fields:
|
|
324
|
+
def _generate_record_class(name: str, fields: tuple[tuple[str, str]]) -> type:
|
|
343
325
|
"""Generate a record class
|
|
344
326
|
|
|
345
327
|
Args:
|
|
@@ -442,9 +424,9 @@ class RecordDescriptor:
|
|
|
442
424
|
_desc_hash: int = None
|
|
443
425
|
_fields: Mapping[str, RecordField] = None
|
|
444
426
|
_all_fields: Mapping[str, RecordField] = None
|
|
445
|
-
_field_tuples: Sequence[
|
|
427
|
+
_field_tuples: Sequence[tuple[str, str]] = None
|
|
446
428
|
|
|
447
|
-
def __init__(self, name: str, fields: Optional[Sequence[
|
|
429
|
+
def __init__(self, name: str, fields: Optional[Sequence[tuple[str, str]]] = None):
|
|
448
430
|
if not name:
|
|
449
431
|
raise RecordDescriptorError("Record name is required")
|
|
450
432
|
|
|
@@ -548,7 +530,7 @@ class RecordDescriptor:
|
|
|
548
530
|
"""Create a new Record initialized with `args` and `kwargs`."""
|
|
549
531
|
return self.recordType(*args, **kwargs)
|
|
550
532
|
|
|
551
|
-
def init_from_dict(self, rdict:
|
|
533
|
+
def init_from_dict(self, rdict: dict[str, Any], raise_unknown=False) -> Record:
|
|
552
534
|
"""Create a new Record initialized with key, value pairs from `rdict`.
|
|
553
535
|
|
|
554
536
|
If `raise_unknown=True` then fields on `rdict` that are unknown to this
|
|
@@ -575,7 +557,7 @@ class RecordDescriptor:
|
|
|
575
557
|
"""
|
|
576
558
|
return self.init_from_dict(record._asdict(), raise_unknown=raise_unknown)
|
|
577
559
|
|
|
578
|
-
def extend(self, fields: Sequence[
|
|
560
|
+
def extend(self, fields: Sequence[tuple[str, str]]) -> RecordDescriptor:
|
|
579
561
|
"""Returns a new RecordDescriptor with the extended fields
|
|
580
562
|
|
|
581
563
|
Returns:
|
|
@@ -584,7 +566,7 @@ class RecordDescriptor:
|
|
|
584
566
|
new_fields = list(self.get_field_tuples()) + fields
|
|
585
567
|
return RecordDescriptor(self.name, new_fields)
|
|
586
568
|
|
|
587
|
-
def get_field_tuples(self) ->
|
|
569
|
+
def get_field_tuples(self) -> tuple[tuple[str, str]]:
|
|
588
570
|
"""Returns a tuple containing the (typename, name) tuples, eg:
|
|
589
571
|
|
|
590
572
|
(('boolean', 'foo'), ('string', 'bar'))
|
|
@@ -596,7 +578,7 @@ class RecordDescriptor:
|
|
|
596
578
|
|
|
597
579
|
@staticmethod
|
|
598
580
|
@functools.lru_cache(maxsize=256)
|
|
599
|
-
def calc_descriptor_hash(name, fields: Sequence[
|
|
581
|
+
def calc_descriptor_hash(name, fields: Sequence[tuple[str, str]]) -> int:
|
|
600
582
|
"""Calculate and return the (cached) descriptor hash as a 32 bit integer.
|
|
601
583
|
|
|
602
584
|
The descriptor hash is the first 4 bytes of the sha256sum of the descriptor name and field names and types.
|
|
@@ -612,7 +594,7 @@ class RecordDescriptor:
|
|
|
612
594
|
return self._desc_hash
|
|
613
595
|
|
|
614
596
|
@property
|
|
615
|
-
def identifier(self) ->
|
|
597
|
+
def identifier(self) -> tuple[str, int]:
|
|
616
598
|
"""Returns a tuple containing the descriptor name and hash"""
|
|
617
599
|
return (self.name, self.descriptor_hash)
|
|
618
600
|
|
|
@@ -650,11 +632,11 @@ class RecordDescriptor:
|
|
|
650
632
|
|
|
651
633
|
return wrapper
|
|
652
634
|
|
|
653
|
-
def _pack(self) ->
|
|
635
|
+
def _pack(self) -> tuple[str, tuple[tuple[str, str]]]:
|
|
654
636
|
return (self.name, self._field_tuples)
|
|
655
637
|
|
|
656
638
|
@staticmethod
|
|
657
|
-
def _unpack(name, fields:
|
|
639
|
+
def _unpack(name, fields: tuple[tuple[str, str]]) -> RecordDescriptor:
|
|
658
640
|
return RecordDescriptor(name, fields)
|
|
659
641
|
|
|
660
642
|
|
|
@@ -662,17 +644,66 @@ def DynamicDescriptor(name, fields):
|
|
|
662
644
|
return RecordDescriptor(name, [("dynamic", field) for field in fields])
|
|
663
645
|
|
|
664
646
|
|
|
665
|
-
def
|
|
647
|
+
def open_stream(fp: BinaryIO, mode: str) -> BinaryIO:
|
|
648
|
+
if not hasattr(fp, "peek"):
|
|
649
|
+
fp = io.BufferedReader(fp)
|
|
650
|
+
|
|
651
|
+
# We peek into the file at the maximum possible length we might need, which is the amount of bytes needed to
|
|
652
|
+
# determine whether a stream is a RECORDSTREAM or not.
|
|
653
|
+
peek_data = fp.peek(RECORDSTREAM_MAGIC_DEPTH)
|
|
654
|
+
|
|
655
|
+
# If the data stream is compressed, we wrap the file pointer in a reader that can decompress accordingly.
|
|
656
|
+
if peek_data[:2] == GZIP_MAGIC:
|
|
657
|
+
fp = gzip.GzipFile(fileobj=fp, mode=mode)
|
|
658
|
+
elif HAS_BZ2 and peek_data[:3] == BZ2_MAGIC:
|
|
659
|
+
fp = bz2.BZ2File(fp, mode=mode)
|
|
660
|
+
elif HAS_LZ4 and peek_data[:4] == LZ4_MAGIC:
|
|
661
|
+
fp = lz4.open(fp, mode=mode)
|
|
662
|
+
elif HAS_ZSTD and peek_data[:4] == ZSTD_MAGIC:
|
|
663
|
+
dctx = zstd.ZstdDecompressor()
|
|
664
|
+
fp = dctx.stream_reader(fp)
|
|
665
|
+
|
|
666
|
+
return fp
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def find_adapter_for_stream(fp: BinaryIO) -> tuple[BinaryIO, Optional[str]]:
|
|
670
|
+
# We need to peek into the stream to be able to determine which adapter is needed. The fp given to this function
|
|
671
|
+
# might already be an instance of the 'Peekable' class, but might also be a different file pointer, for example
|
|
672
|
+
# a transparent decompressor. As calling peek() twice on the same peekable is not allowed, we wrap the fp into
|
|
673
|
+
# a Peekable again, so that we are able to determine the correct adapter.
|
|
674
|
+
if not hasattr(fp, "peek"):
|
|
675
|
+
fp = io.BufferedReader(fp)
|
|
676
|
+
|
|
677
|
+
peek_data = fp.peek(RECORDSTREAM_MAGIC_DEPTH)
|
|
678
|
+
if HAS_AVRO and peek_data[:3] == AVRO_MAGIC:
|
|
679
|
+
return fp, "avro"
|
|
680
|
+
elif RECORDSTREAM_MAGIC in peek_data[:RECORDSTREAM_MAGIC_DEPTH]:
|
|
681
|
+
return fp, "stream"
|
|
682
|
+
return fp, None
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def open_file(path: Union[str, Path, BinaryIO], mode: str, clobber: bool = True) -> IO:
|
|
686
|
+
if isinstance(path, Path):
|
|
687
|
+
path = str(path)
|
|
688
|
+
if isinstance(path, str):
|
|
689
|
+
return open_path(path, mode, clobber)
|
|
690
|
+
elif isinstance(path, io.IOBase):
|
|
691
|
+
return open_stream(path, "rb")
|
|
692
|
+
else:
|
|
693
|
+
raise ValueError(f"Unsupported path type {path}")
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def open_path(path: str, mode: str, clobber: bool = True) -> IO:
|
|
666
697
|
"""
|
|
667
|
-
Open
|
|
698
|
+
Open ``path`` using ``mode`` and returns a file object.
|
|
668
699
|
|
|
669
700
|
It handles special cases if path is meant to be stdin or stdout.
|
|
670
701
|
And also supports compression based on extension or file header of stream.
|
|
671
702
|
|
|
672
703
|
Args:
|
|
673
|
-
path
|
|
674
|
-
mode
|
|
675
|
-
clobber
|
|
704
|
+
path: Filename or path to filename to open
|
|
705
|
+
mode: Could be "r", "rb" to open file for reading, "w", "wb" for writing
|
|
706
|
+
clobber: Overwrite file if it already exists if `clobber=True`, else raises IOError.
|
|
676
707
|
|
|
677
708
|
"""
|
|
678
709
|
binary = "b" in mode
|
|
@@ -724,24 +755,18 @@ def open_path(path, mode, clobber=True):
|
|
|
724
755
|
fp = io.open(path, mode)
|
|
725
756
|
# check if we are reading a compressed stream
|
|
726
757
|
if not out and binary:
|
|
727
|
-
|
|
728
|
-
fp = Peekable(fp)
|
|
729
|
-
peek_data = fp.peek(4)
|
|
730
|
-
if peek_data[:2] == GZIP_MAGIC:
|
|
731
|
-
fp = gzip.GzipFile(fileobj=fp, mode=mode)
|
|
732
|
-
elif HAS_BZ2 and peek_data[:3] == BZ2_MAGIC:
|
|
733
|
-
fp = bz2.BZ2File(fp, mode=mode)
|
|
734
|
-
elif HAS_LZ4 and peek_data[:4] == LZ4_MAGIC:
|
|
735
|
-
fp = lz4.open(fp, mode=mode)
|
|
736
|
-
elif HAS_ZSTD and peek_data[:4] == ZSTD_MAGIC:
|
|
737
|
-
dctx = zstd.ZstdDecompressor()
|
|
738
|
-
fp = dctx.stream_reader(fp)
|
|
758
|
+
fp = open_stream(fp, mode)
|
|
739
759
|
return fp
|
|
740
760
|
|
|
741
761
|
|
|
742
|
-
def RecordAdapter(
|
|
743
|
-
url
|
|
744
|
-
|
|
762
|
+
def RecordAdapter(
|
|
763
|
+
url: Optional[str] = None,
|
|
764
|
+
out: bool = False,
|
|
765
|
+
selector: Optional[str] = None,
|
|
766
|
+
clobber: bool = True,
|
|
767
|
+
fileobj: Optional[BinaryIO] = None,
|
|
768
|
+
**kwargs,
|
|
769
|
+
) -> Union[AbstractWriter, AbstractReader]:
|
|
745
770
|
# Guess adapter based on extension
|
|
746
771
|
ext_to_adapter = {
|
|
747
772
|
".avro": "avro",
|
|
@@ -749,42 +774,94 @@ def RecordAdapter(url, out, selector=None, clobber=True, **kwargs):
|
|
|
749
774
|
".jsonl": "jsonfile",
|
|
750
775
|
".csv": "csvfile",
|
|
751
776
|
}
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
777
|
+
cls_stream = None
|
|
778
|
+
cls_url = None
|
|
779
|
+
adapter = None
|
|
780
|
+
|
|
781
|
+
# When a url is given, we interpret it to determine what kind of adapter we need. This piece of logic is always
|
|
782
|
+
# necessary for the RecordWriter (as it does not currently support file-like objects), and only needed for
|
|
783
|
+
# RecordReader if a url is provided.
|
|
784
|
+
if out is True or url not in ("-", "", None):
|
|
785
|
+
# Either stdout / stdin is given, or a path-like string.
|
|
786
|
+
url = str(url or "")
|
|
787
|
+
_, ext = os.path.splitext(url)
|
|
788
|
+
|
|
789
|
+
adapter_scheme = ext_to_adapter.get(ext, "stream")
|
|
790
|
+
if "://" not in url:
|
|
791
|
+
url = f"{adapter_scheme}://{url}"
|
|
792
|
+
p = urlparse(url, scheme=adapter_scheme)
|
|
793
|
+
adapter, _, sub_adapter = p.scheme.partition("+")
|
|
794
|
+
|
|
795
|
+
arg_dict = dict(parse_qsl(p.query))
|
|
796
|
+
arg_dict.update(kwargs)
|
|
797
|
+
|
|
798
|
+
cls_url = p.netloc + p.path
|
|
799
|
+
if sub_adapter:
|
|
800
|
+
cls_url = sub_adapter + "://" + cls_url
|
|
801
|
+
elif url in ("-", ""):
|
|
802
|
+
# For reading stdin, we cannot rely on an extension to know what sort of stream is incoming. Thus, we will treat
|
|
803
|
+
# it as a 'fileobj', where we can peek into the stream and try to select the appropriate adapter.
|
|
804
|
+
fileobj = getattr(sys.stdin, "buffer", sys.stdin)
|
|
805
|
+
if fileobj is not None:
|
|
806
|
+
# This record adapter has received a file-like object for record reading
|
|
807
|
+
# We just need to find the right adapter by peeking into the first few bytes.
|
|
808
|
+
|
|
809
|
+
# First, we open the stream. If the stream is compressed, open_stream will wrap it for us into a decompressor.
|
|
810
|
+
cls_stream = open_stream(fileobj, "rb")
|
|
811
|
+
|
|
812
|
+
# Now, we have a stream that will be transparently decompressed but we still do not know what adapter to use.
|
|
813
|
+
# This requires a new peek into the transparent stream. This peek will cause the stream pointer to be moved.
|
|
814
|
+
# Therefore, find_adapter_for_stream returns both a BinaryIO-supportive object that can correctly read the
|
|
815
|
+
# adjusted stream, and a string indicating the type of adapter to be used on said stream.
|
|
816
|
+
arg_dict = kwargs.copy()
|
|
817
|
+
|
|
818
|
+
# If a user did not provide a url, we have to peek into the stream to be able to determine the right adapter
|
|
819
|
+
# based on magic bytes encountered in the first few bytes of the stream.
|
|
820
|
+
if adapter is None:
|
|
821
|
+
cls_stream, adapter = find_adapter_for_stream(cls_stream)
|
|
822
|
+
if adapter is None:
|
|
823
|
+
peek_data = cls_stream.peek(RECORDSTREAM_MAGIC_DEPTH)
|
|
824
|
+
if peek_data and peek_data.startswith(b"<"):
|
|
825
|
+
# As peek() can result in a larger buffer than requested, we make sure the peek_data variable isn't
|
|
826
|
+
# unnecessarily long in the error message.
|
|
827
|
+
peek_data = peek_data[:RECORDSTREAM_MAGIC_DEPTH]
|
|
828
|
+
raise RecordAdapterNotFound(
|
|
829
|
+
(
|
|
830
|
+
f"Could not find a reader for input {peek_data!r}. Are you perhaps "
|
|
831
|
+
"entering record text, rather than a record stream? This can be fixed by using "
|
|
832
|
+
"'rdump -w -' to write a record stream to stdout."
|
|
833
|
+
)
|
|
834
|
+
)
|
|
835
|
+
raise RecordAdapterNotFound("Could not find adapter for file-like object")
|
|
836
|
+
|
|
837
|
+
# Now that we know which adapter is needed, we import it.
|
|
761
838
|
mod = importlib.import_module("flow.record.adapter.{}".format(adapter))
|
|
762
|
-
|
|
763
839
|
clsname = ("{}Writer" if out else "{}Reader").format(adapter.title())
|
|
764
840
|
|
|
765
841
|
cls = getattr(mod, clsname)
|
|
766
|
-
arg_dict = dict(parse_qsl(p.query))
|
|
767
|
-
arg_dict.update(kwargs)
|
|
768
|
-
cls_url = p.netloc + p.path
|
|
769
|
-
if sub_adapter:
|
|
770
|
-
cls_url = sub_adapter + "://" + cls_url
|
|
771
|
-
|
|
772
842
|
if not out and selector:
|
|
773
843
|
arg_dict["selector"] = selector
|
|
774
844
|
|
|
775
845
|
if out:
|
|
776
846
|
arg_dict["clobber"] = clobber
|
|
777
|
-
|
|
778
847
|
log.debug("Creating {!r} for {!r} with args {!r}".format(cls, url, arg_dict))
|
|
848
|
+
|
|
849
|
+
if fileobj is not None:
|
|
850
|
+
return cls(cls_stream, **arg_dict)
|
|
779
851
|
return cls(cls_url, **arg_dict)
|
|
780
852
|
|
|
781
853
|
|
|
782
|
-
def RecordReader(
|
|
783
|
-
|
|
854
|
+
def RecordReader(
|
|
855
|
+
url: Optional[str] = None,
|
|
856
|
+
selector: Optional[str] = None,
|
|
857
|
+
fileobj: Optional[BinaryIO] = None,
|
|
858
|
+
**kwargs,
|
|
859
|
+
) -> AbstractReader:
|
|
860
|
+
return RecordAdapter(url=url, out=False, selector=selector, fileobj=fileobj, **kwargs)
|
|
784
861
|
|
|
785
862
|
|
|
786
|
-
def RecordWriter(url=None, clobber=True, **kwargs):
|
|
787
|
-
return RecordAdapter(url, True, clobber=clobber, **kwargs)
|
|
863
|
+
def RecordWriter(url: Optional[str] = None, clobber: bool = True, **kwargs) -> AbstractWriter:
|
|
864
|
+
return RecordAdapter(url=url, out=True, clobber=clobber, **kwargs)
|
|
788
865
|
|
|
789
866
|
|
|
790
867
|
def stream(src, dst):
|
|
@@ -834,7 +911,7 @@ def fieldtype(clspath: str) -> FieldType:
|
|
|
834
911
|
|
|
835
912
|
@functools.lru_cache(maxsize=4069)
|
|
836
913
|
def merge_record_descriptors(
|
|
837
|
-
descriptors:
|
|
914
|
+
descriptors: tuple[RecordDescriptor], replace: bool = False, name: Optional[str] = None
|
|
838
915
|
) -> RecordDescriptor:
|
|
839
916
|
"""Create a newly merged RecordDescriptor from a list of RecordDescriptors.
|
|
840
917
|
This function uses a cache to avoid creating the same descriptor multiple times.
|
|
@@ -861,7 +938,7 @@ def merge_record_descriptors(
|
|
|
861
938
|
|
|
862
939
|
|
|
863
940
|
def extend_record(
|
|
864
|
-
record: Record, other_records:
|
|
941
|
+
record: Record, other_records: list[Record], replace: bool = False, name: Optional[str] = None
|
|
865
942
|
) -> Record:
|
|
866
943
|
"""Extend ``record`` with fields and values from ``other_records``.
|
|
867
944
|
|
|
@@ -15,9 +15,14 @@ from typing import Any, Optional, Tuple
|
|
|
15
15
|
from urllib.parse import urlparse
|
|
16
16
|
|
|
17
17
|
try:
|
|
18
|
-
|
|
18
|
+
try:
|
|
19
|
+
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
20
|
+
except ImportError:
|
|
21
|
+
from backports.zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
22
|
+
HAS_ZONE_INFO = True
|
|
19
23
|
except ImportError:
|
|
20
|
-
|
|
24
|
+
HAS_ZONE_INFO = False
|
|
25
|
+
|
|
21
26
|
|
|
22
27
|
from flow.record.base import FieldType
|
|
23
28
|
|
|
@@ -50,9 +55,16 @@ def flow_record_tz(*, default_tz: str = "UTC") -> Optional[ZoneInfo | UTC]:
|
|
|
50
55
|
Returns:
|
|
51
56
|
None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` or ``UTC`` if ZoneInfo is not found.
|
|
52
57
|
"""
|
|
58
|
+
|
|
53
59
|
tz = os.environ.get("FLOW_RECORD_TZ", default_tz)
|
|
54
60
|
if tz.upper() == "NONE":
|
|
55
61
|
return None
|
|
62
|
+
|
|
63
|
+
if not HAS_ZONE_INFO:
|
|
64
|
+
if tz != "UTC":
|
|
65
|
+
warnings.warn("Cannot use FLOW_RECORD_TZ due to missing zoneinfo module, defaulting to 'UTC'.")
|
|
66
|
+
return UTC
|
|
67
|
+
|
|
56
68
|
try:
|
|
57
69
|
return ZoneInfo(tz)
|
|
58
70
|
except ZoneInfoNotFoundError as exc:
|
|
@@ -8,7 +8,7 @@ import sys
|
|
|
8
8
|
from collections import ChainMap
|
|
9
9
|
from functools import lru_cache
|
|
10
10
|
|
|
11
|
-
from flow.record import RecordWriter
|
|
11
|
+
from flow.record import RECORDSTREAM_MAGIC, RecordWriter
|
|
12
12
|
from flow.record.fieldtypes import fieldtype_for_value
|
|
13
13
|
from flow.record.selector import make_selector
|
|
14
14
|
|
|
@@ -17,8 +17,6 @@ from .packer import RecordPacker
|
|
|
17
17
|
|
|
18
18
|
log = logging.getLogger(__package__)
|
|
19
19
|
|
|
20
|
-
RECORDSTREAM_MAGIC = b"RECORDSTREAM\n"
|
|
21
|
-
|
|
22
20
|
|
|
23
21
|
def RecordOutput(fp):
|
|
24
22
|
"""Return a RecordPrinter if `fp` is a tty otherwise a RecordStreamWriter."""
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# file generated by setuptools_scm
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
TYPE_CHECKING = False
|
|
4
|
+
if TYPE_CHECKING:
|
|
5
|
+
from typing import Tuple, Union
|
|
6
|
+
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
7
|
+
else:
|
|
8
|
+
VERSION_TUPLE = object
|
|
9
|
+
|
|
10
|
+
version: str
|
|
11
|
+
__version__: str
|
|
12
|
+
__version_tuple__: VERSION_TUPLE
|
|
13
|
+
version_tuple: VERSION_TUPLE
|
|
14
|
+
|
|
15
|
+
__version__ = version = '3.13.dev2'
|
|
16
|
+
__version_tuple__ = version_tuple = (3, 13, 'dev2')
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flow.record
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.13.dev2
|
|
4
4
|
Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
|
|
5
5
|
Author-email: Dissect Team <dissect@fox-it.com>
|
|
6
6
|
License: Affero General Public License v3
|
|
@@ -18,13 +18,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
18
18
|
Classifier: Topic :: Utilities
|
|
19
19
|
Requires-Python: ~=3.7
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
License-File: COPYRIGHT
|
|
23
|
+
Requires-Dist: msgpack>=0.5.2
|
|
24
|
+
Requires-Dist: backports.zoneinfo[tzdata]; python_version < "3.9"
|
|
25
|
+
Requires-Dist: tzdata; platform_system == "Windows"
|
|
21
26
|
Provides-Extra: compression
|
|
27
|
+
Requires-Dist: lz4; extra == "compression"
|
|
28
|
+
Requires-Dist: zstandard; extra == "compression"
|
|
22
29
|
Provides-Extra: elastic
|
|
30
|
+
Requires-Dist: elasticsearch; extra == "elastic"
|
|
23
31
|
Provides-Extra: geoip
|
|
32
|
+
Requires-Dist: maxminddb; extra == "geoip"
|
|
24
33
|
Provides-Extra: avro
|
|
34
|
+
Requires-Dist: fastavro[snappy]; extra == "avro"
|
|
25
35
|
Provides-Extra: test
|
|
26
|
-
|
|
27
|
-
|
|
36
|
+
Requires-Dist: lz4; extra == "test"
|
|
37
|
+
Requires-Dist: zstandard; extra == "test"
|
|
38
|
+
Requires-Dist: fastavro; extra == "test"
|
|
28
39
|
|
|
29
40
|
# flow.record
|
|
30
41
|
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from flow.record import RecordDescriptor, RecordReader
|
|
6
|
+
from flow.record.adapter.avro import AvroReader, AvroWriter
|
|
7
|
+
from flow.record.base import HAS_AVRO
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def generate_records(amount):
|
|
11
|
+
TestRecordWithFooBar = RecordDescriptor(
|
|
12
|
+
"test/record",
|
|
13
|
+
[
|
|
14
|
+
("string", "name"),
|
|
15
|
+
("string", "foo"),
|
|
16
|
+
("string", "bar"),
|
|
17
|
+
],
|
|
18
|
+
)
|
|
19
|
+
for i in range(amount):
|
|
20
|
+
yield TestRecordWithFooBar(name=f"record{i}", foo="bar", bar="baz")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_writing_reading_avrofile(tmp_path):
|
|
24
|
+
if not HAS_AVRO:
|
|
25
|
+
pytest.skip("fastavro module not installed")
|
|
26
|
+
avro_path = tmp_path / "test.avro"
|
|
27
|
+
|
|
28
|
+
out = AvroWriter(avro_path)
|
|
29
|
+
for rec in generate_records(100):
|
|
30
|
+
out.write(rec)
|
|
31
|
+
out.close()
|
|
32
|
+
|
|
33
|
+
reader = AvroReader(avro_path)
|
|
34
|
+
for index, rec in enumerate(reader):
|
|
35
|
+
assert rec.name == f"record{index}"
|
|
36
|
+
assert rec.foo == "bar"
|
|
37
|
+
assert rec.bar == "baz"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_avrostream_filelike_object(tmp_path):
|
|
41
|
+
if not HAS_AVRO:
|
|
42
|
+
pytest.skip("fastavro module not installed")
|
|
43
|
+
avro_path = tmp_path / "test.avro"
|
|
44
|
+
|
|
45
|
+
out = AvroWriter(avro_path)
|
|
46
|
+
for rec in generate_records(100):
|
|
47
|
+
out.write(rec)
|
|
48
|
+
out.close()
|
|
49
|
+
|
|
50
|
+
with open(avro_path, "rb") as avro_file:
|
|
51
|
+
avro_buffer = avro_file.read()
|
|
52
|
+
|
|
53
|
+
avro_io = BytesIO(avro_buffer)
|
|
54
|
+
|
|
55
|
+
reader = RecordReader(fileobj=avro_io)
|
|
56
|
+
|
|
57
|
+
# The record reader should automatically have created an 'AvroReader' to handle the Avro Record Stream
|
|
58
|
+
assert isinstance(reader, AvroReader)
|
|
59
|
+
|
|
60
|
+
# Verify if selector worked and records are the same
|
|
61
|
+
for index, rec in enumerate(reader):
|
|
62
|
+
assert rec.name == f"record{index}"
|
|
63
|
+
assert rec.foo == "bar"
|
|
64
|
+
assert rec.bar == "baz"
|
|
@@ -67,7 +67,7 @@ def test_rdump_pipe(tmp_path):
|
|
|
67
67
|
)
|
|
68
68
|
stdout, stderr = p2.communicate()
|
|
69
69
|
assert stdout.strip() == b""
|
|
70
|
-
assert b"
|
|
70
|
+
assert b"Are you perhaps entering record text, rather than a record stream?" in stderr.strip()
|
|
71
71
|
|
|
72
72
|
# rdump test.records -w - | rdump -s 'r.count in (1, 3, 9)' -w filtered.records
|
|
73
73
|
path2 = tmp_path / "filtered.records"
|
|
@@ -461,6 +461,43 @@ def test_rdump_headerless_csv(tmp_path, capsysbinary):
|
|
|
461
461
|
]
|
|
462
462
|
|
|
463
463
|
|
|
464
|
+
def test_rdump_stdin_peek(tmp_path):
|
|
465
|
+
if platform.system() == "Windows":
|
|
466
|
+
pytest.skip("No Gzip on Windows")
|
|
467
|
+
|
|
468
|
+
TestRecord = RecordDescriptor(
|
|
469
|
+
"test/record",
|
|
470
|
+
[
|
|
471
|
+
("varint", "count"),
|
|
472
|
+
("string", "foo"),
|
|
473
|
+
],
|
|
474
|
+
)
|
|
475
|
+
|
|
476
|
+
path = tmp_path / "test.records"
|
|
477
|
+
writer = RecordWriter(path)
|
|
478
|
+
# generate some test records
|
|
479
|
+
for i in range(10):
|
|
480
|
+
writer.write(TestRecord(count=i, foo="bar"))
|
|
481
|
+
writer.close()
|
|
482
|
+
|
|
483
|
+
# Gzip compress records file
|
|
484
|
+
compress_cmd = ["gzip", "--keep", str(path)]
|
|
485
|
+
subprocess.check_output(compress_cmd)
|
|
486
|
+
compressed_path = str(path) + ".gz"
|
|
487
|
+
|
|
488
|
+
# Rdump should transparently decompress and select the correct adapter
|
|
489
|
+
p1 = subprocess.Popen(["cat", compressed_path], stdout=subprocess.PIPE)
|
|
490
|
+
p2 = subprocess.Popen(
|
|
491
|
+
["rdump", "-s", "r.count == 5"],
|
|
492
|
+
stdin=p1.stdout,
|
|
493
|
+
stdout=subprocess.PIPE,
|
|
494
|
+
stderr=subprocess.PIPE,
|
|
495
|
+
)
|
|
496
|
+
stdout, _ = p2.communicate()
|
|
497
|
+
|
|
498
|
+
assert stdout.strip() in (b"<test/record count=5 foo='bar'>", b"<test/record count=5L foo=u'bar'>")
|
|
499
|
+
|
|
500
|
+
|
|
464
501
|
@pytest.mark.parametrize(
|
|
465
502
|
("total_records", "count", "skip", "expected_numbers"),
|
|
466
503
|
[
|
|
@@ -18,6 +18,7 @@ from flow.record import (
|
|
|
18
18
|
RecordStreamReader,
|
|
19
19
|
RecordWriter,
|
|
20
20
|
)
|
|
21
|
+
from flow.record.adapter.stream import StreamReader
|
|
21
22
|
from flow.record.base import (
|
|
22
23
|
BZ2_MAGIC,
|
|
23
24
|
GZIP_MAGIC,
|
|
@@ -46,6 +47,26 @@ def test_stream_writer_reader():
|
|
|
46
47
|
assert set([2, 7]) == set([r.number for r in records])
|
|
47
48
|
|
|
48
49
|
|
|
50
|
+
def test_recordstream_filelike_object():
|
|
51
|
+
fp = StringIO()
|
|
52
|
+
out = RecordOutput(fp)
|
|
53
|
+
for rec in generate_records():
|
|
54
|
+
out.write(rec)
|
|
55
|
+
|
|
56
|
+
fp.seek(0)
|
|
57
|
+
reader = RecordReader(fileobj=fp, selector="r.number in (6, 9)")
|
|
58
|
+
|
|
59
|
+
# The record reader should automatically have created a 'StreamReader' to handle the Record Stream.
|
|
60
|
+
assert isinstance(reader, StreamReader)
|
|
61
|
+
|
|
62
|
+
# Verify if selector worked and records are the same
|
|
63
|
+
records = []
|
|
64
|
+
for rec in reader:
|
|
65
|
+
records.append(rec)
|
|
66
|
+
|
|
67
|
+
assert set([6, 9]) == set([r.number for r in records])
|
|
68
|
+
|
|
69
|
+
|
|
49
70
|
@pytest.mark.parametrize("PSelector", [Selector, CompiledSelector])
|
|
50
71
|
def test_file_writer_reader(tmpdir, PSelector):
|
|
51
72
|
p = tmpdir.join("test.records")
|
|
@@ -104,6 +125,15 @@ def test_compressed_writer_reader(tmpdir, compression):
|
|
|
104
125
|
|
|
105
126
|
assert numbers == list(range(count))
|
|
106
127
|
|
|
128
|
+
# Using a file-handle instead of a path should also work
|
|
129
|
+
with open(path, "rb") as fh:
|
|
130
|
+
reader = RecordReader(fileobj=fh)
|
|
131
|
+
numbers = []
|
|
132
|
+
for rec in reader:
|
|
133
|
+
numbers.append(rec.number)
|
|
134
|
+
|
|
135
|
+
assert numbers == list(range(count))
|
|
136
|
+
|
|
107
137
|
|
|
108
138
|
def test_path_template_writer(tmpdir):
|
|
109
139
|
TestRecord = RecordDescriptor(
|
|
@@ -5,7 +5,8 @@ import pathlib
|
|
|
5
5
|
import subprocess
|
|
6
6
|
import sys
|
|
7
7
|
from datetime import datetime, timezone
|
|
8
|
-
from
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from unittest.mock import MagicMock, mock_open, patch
|
|
9
10
|
|
|
10
11
|
import msgpack
|
|
11
12
|
import pytest
|
|
@@ -589,7 +590,11 @@ def test_record_adapter_windows_path(tmp_path):
|
|
|
589
590
|
writer.write(TestRecord("foo"))
|
|
590
591
|
writer.write(TestRecord("bar"))
|
|
591
592
|
|
|
592
|
-
|
|
593
|
+
test_read_buf = BytesIO(path_records.read_bytes())
|
|
594
|
+
mock_reader = MagicMock(wraps=test_read_buf, spec=BytesIO)
|
|
595
|
+
|
|
596
|
+
with patch("io.open", MagicMock(return_value=mock_reader)) as m:
|
|
597
|
+
m.return_value.closed = False
|
|
593
598
|
adapter = RecordReader(r"c:\users\user\test.records")
|
|
594
599
|
assert type(adapter).__name__ == "StreamReader"
|
|
595
600
|
m.assert_called_once_with(r"c:\users\user\test.records", "rb")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|