flow.record 3.15.dev4__tar.gz → 3.15.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flow.record-3.15.dev4/flow.record.egg-info → flow.record-3.15.dev6}/PKG-INFO +8 -4
- flow.record-3.15.dev6/flow/record/adapter/duckdb.py +56 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/sqlite.py +39 -21
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/stream.py +2 -1
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/version.py +2 -2
- {flow.record-3.15.dev4 → flow.record-3.15.dev6/flow.record.egg-info}/PKG-INFO +8 -4
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/SOURCES.txt +2 -1
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/requires.txt +10 -3
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/pyproject.toml +8 -3
- flow.record-3.15.dev4/tests/test_sqlite_adapter.py → flow.record-3.15.dev6/tests/test_sqlite_duckdb_adapter.py +81 -41
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tox.ini +1 -1
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/COPYRIGHT +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/LICENSE +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/MANIFEST.in +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/README.md +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/filesystem.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/passivedns.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/records.json +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/tcpconn.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/__init__.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/__init__.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/archive.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/avro.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/broker.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/csvfile.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/elastic.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/jsonfile.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/line.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/mongo.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/split.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/splunk.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/stream.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/text.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/xlsx.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/base.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/exceptions.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/__init__.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/credential.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/__init__.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/ip.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/ipv4.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/tcp.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/udp.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/jsonpacker.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/packer.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/selector.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/tools/__init__.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/tools/geoip.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/tools/rdump.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/utils.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/whitelist.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/dependency_links.txt +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/entry_points.txt +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/top_level.txt +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/setup.cfg +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/__init__.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/_utils.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/docs/Makefile +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/docs/conf.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/docs/index.rst +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/selector_explain_example.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/standalone_test.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_avro.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_avro_adapter.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_compiled_selector.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_csv_adapter.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_deprecations.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_fieldtype_ip.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_fieldtypes.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_json_packer.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_json_record_adapter.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_multi_timestamp.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_packer.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_rdump.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_record.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_record_adapter.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_record_descriptor.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_regression.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_selector.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_splunk_adapter.py +0 -0
- {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/utils_inspect.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flow.record
|
|
3
|
-
Version: 3.15.dev4
|
|
3
|
+
Version: 3.15.dev6
|
|
4
4
|
Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
|
|
5
5
|
Author-email: Dissect Team <dissect@fox-it.com>
|
|
6
6
|
License: Affero General Public License v3
|
|
@@ -32,10 +32,14 @@ Provides-Extra: geoip
|
|
|
32
32
|
Requires-Dist: maxminddb; extra == "geoip"
|
|
33
33
|
Provides-Extra: avro
|
|
34
34
|
Requires-Dist: fastavro[snappy]; extra == "avro"
|
|
35
|
+
Provides-Extra: duckdb
|
|
36
|
+
Requires-Dist: duckdb; extra == "duckdb"
|
|
37
|
+
Requires-Dist: pytz; extra == "duckdb"
|
|
35
38
|
Provides-Extra: test
|
|
36
|
-
Requires-Dist:
|
|
37
|
-
Requires-Dist:
|
|
38
|
-
Requires-Dist:
|
|
39
|
+
Requires-Dist: flow.record[compression]; extra == "test"
|
|
40
|
+
Requires-Dist: flow.record[avro]; extra == "test"
|
|
41
|
+
Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
|
|
42
|
+
Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
|
|
39
43
|
|
|
40
44
|
# flow.record
|
|
41
45
|
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import duckdb
|
|
6
|
+
|
|
7
|
+
from flow.record.adapter.sqlite import (
|
|
8
|
+
Selector,
|
|
9
|
+
SqliteReader,
|
|
10
|
+
SqliteWriter,
|
|
11
|
+
make_selector,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
__usage__ = """
|
|
17
|
+
DuckDB adapter
|
|
18
|
+
---
|
|
19
|
+
Write usage: rdump -w duckdb://[PATH]?batch_size=[BATCH_SIZE]
|
|
20
|
+
Read usage: rdump duckdb://[PATH]?batch_size=[BATCH_SIZE]
|
|
21
|
+
[PATH]: path to DuckDB database file
|
|
22
|
+
|
|
23
|
+
Optional parameters:
|
|
24
|
+
[BATCH_SIZE]: number of records to read or write in a single transaction (default: 1000)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DuckdbReader(SqliteReader):
|
|
29
|
+
"""DuckDB reader, subclasses from SQLite reader."""
|
|
30
|
+
|
|
31
|
+
logger = logger
|
|
32
|
+
|
|
33
|
+
def __init__(self, path: str, *, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
|
|
34
|
+
self.selector = make_selector(selector)
|
|
35
|
+
self.descriptors_seen = set()
|
|
36
|
+
self.con = duckdb.connect(path)
|
|
37
|
+
self.count = 0
|
|
38
|
+
self.batch_size = int(batch_size)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DuckdbWriter(SqliteWriter):
|
|
42
|
+
"""DuckDB writer, subclasses from SQLite writer."""
|
|
43
|
+
|
|
44
|
+
logger = logger
|
|
45
|
+
|
|
46
|
+
def __init__(self, path: str, *, batch_size: str | int = 1000, **kwargs):
|
|
47
|
+
self.descriptors_seen = set()
|
|
48
|
+
self.con = None
|
|
49
|
+
self.con = duckdb.connect(path)
|
|
50
|
+
self.count = 0
|
|
51
|
+
self.batch_size = int(batch_size)
|
|
52
|
+
self.con.begin()
|
|
53
|
+
|
|
54
|
+
def tx_cycle(self) -> None:
|
|
55
|
+
self.con.commit()
|
|
56
|
+
self.con.begin()
|
|
@@ -18,7 +18,7 @@ SQLite adapter
|
|
|
18
18
|
---
|
|
19
19
|
Write usage: rdump -w sqlite://[PATH]?batch_size=[BATCH_SIZE]
|
|
20
20
|
Read usage: rdump sqlite://[PATH]?batch_size=[BATCH_SIZE]
|
|
21
|
-
[PATH]: path to
|
|
21
|
+
[PATH]: path to SQLite database file
|
|
22
22
|
|
|
23
23
|
Optional parameters:
|
|
24
24
|
[BATCH_SIZE]: number of records to read or write in a single transaction (default: 1000)
|
|
@@ -28,12 +28,12 @@ Optional parameters:
|
|
|
28
28
|
FIELD_MAP = {
|
|
29
29
|
"int": "INTEGER",
|
|
30
30
|
"uint32": "INTEGER",
|
|
31
|
-
"varint": "
|
|
31
|
+
"varint": "BIGINT",
|
|
32
32
|
"float": "REAL",
|
|
33
33
|
"boolean": "INTEGER",
|
|
34
34
|
"bytes": "BLOB",
|
|
35
|
-
"filesize": "
|
|
36
|
-
"datetime": "
|
|
35
|
+
"filesize": "BIGINT",
|
|
36
|
+
"datetime": "TIMESTAMPTZ",
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
|
|
@@ -41,12 +41,15 @@ FIELD_MAP = {
|
|
|
41
41
|
SQLITE_FIELD_MAP = {
|
|
42
42
|
"VARCHAR": "string",
|
|
43
43
|
"INTEGER": "varint",
|
|
44
|
+
"BIGINT": "varint",
|
|
44
45
|
"BLOB": "bytes",
|
|
45
46
|
"REAL": "float",
|
|
46
47
|
"DOUBLE": "float",
|
|
47
48
|
"BOOLEAN": "boolean",
|
|
48
49
|
"DATETIME": "datetime",
|
|
49
50
|
"TIMESTAMP": "datetime",
|
|
51
|
+
"TIMESTAMPTZ": "datetime",
|
|
52
|
+
"TIMESTAMP WITH TIME ZONE": "datetime",
|
|
50
53
|
}
|
|
51
54
|
|
|
52
55
|
|
|
@@ -58,11 +61,11 @@ def create_descriptor_table(con: sqlite3.Connection, descriptor: RecordDescripto
|
|
|
58
61
|
column_defs = []
|
|
59
62
|
for column_name, fieldset in descriptor.get_all_fields().items():
|
|
60
63
|
column_type = FIELD_MAP.get(fieldset.typename, "TEXT")
|
|
61
|
-
column_defs.append(f"
|
|
64
|
+
column_defs.append(f' "{column_name}" {column_type}')
|
|
62
65
|
sql_columns = ",\n".join(column_defs)
|
|
63
66
|
|
|
64
67
|
# Create the descriptor table
|
|
65
|
-
sql = f
|
|
68
|
+
sql = f'CREATE TABLE IF NOT EXISTS "{table_name}" (\n{sql_columns}\n)'
|
|
66
69
|
logger.debug(sql)
|
|
67
70
|
con.execute(sql)
|
|
68
71
|
|
|
@@ -72,7 +75,7 @@ def update_descriptor_columns(con: sqlite3.Connection, descriptor: RecordDescrip
|
|
|
72
75
|
table_name = descriptor.name
|
|
73
76
|
|
|
74
77
|
# Get existing columns
|
|
75
|
-
cursor = con.execute(f
|
|
78
|
+
cursor = con.execute(f'PRAGMA table_info("{table_name}")')
|
|
76
79
|
column_names = set(row[1] for row in cursor.fetchall())
|
|
77
80
|
|
|
78
81
|
# Add missing columns
|
|
@@ -81,23 +84,23 @@ def update_descriptor_columns(con: sqlite3.Connection, descriptor: RecordDescrip
|
|
|
81
84
|
if column_name in column_names:
|
|
82
85
|
continue
|
|
83
86
|
column_type = FIELD_MAP.get(fieldset.typename, "TEXT")
|
|
84
|
-
column_defs.append(f
|
|
87
|
+
column_defs.append(f' ALTER TABLE "{table_name}" ADD COLUMN "{column_name}" {column_type}')
|
|
85
88
|
|
|
86
89
|
# No missing columns
|
|
87
90
|
if not column_defs:
|
|
88
91
|
return None
|
|
89
92
|
|
|
90
93
|
# Add the new columns
|
|
91
|
-
|
|
92
|
-
|
|
94
|
+
for col_def in column_defs:
|
|
95
|
+
con.execute(col_def)
|
|
93
96
|
|
|
94
97
|
|
|
95
98
|
@lru_cache(maxsize=1000)
|
|
96
99
|
def prepare_insert_sql(table_name: str, field_names: tuple[str]) -> str:
|
|
97
100
|
"""Return (cached) prepared SQL statement for inserting a record based on table name and field names."""
|
|
98
|
-
column_names = ", ".join(f"
|
|
101
|
+
column_names = ", ".join(f'"{name}"' for name in field_names)
|
|
99
102
|
value_placeholder = ", ".join(["?"] * len(field_names))
|
|
100
|
-
return f
|
|
103
|
+
return f'INSERT INTO "{table_name}" ({column_names}) VALUES ({value_placeholder})'
|
|
101
104
|
|
|
102
105
|
|
|
103
106
|
def db_insert_record(con: sqlite3.Connection, record: Record) -> None:
|
|
@@ -123,7 +126,11 @@ def db_insert_record(con: sqlite3.Connection, record: Record) -> None:
|
|
|
123
126
|
|
|
124
127
|
|
|
125
128
|
class SqliteReader(AbstractReader):
|
|
126
|
-
|
|
129
|
+
"""SQLite reader."""
|
|
130
|
+
|
|
131
|
+
logger = logger
|
|
132
|
+
|
|
133
|
+
def __init__(self, path: str, *, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
|
|
127
134
|
self.selector = make_selector(selector)
|
|
128
135
|
self.descriptors_seen = set()
|
|
129
136
|
self.con = sqlite3.connect(path)
|
|
@@ -140,7 +147,7 @@ class SqliteReader(AbstractReader):
|
|
|
140
147
|
|
|
141
148
|
# flow.record is quite strict with what is allowed in fieldnames or decriptor name.
|
|
142
149
|
# While SQLite is less strict, we need to sanitize the names to make them compatible.
|
|
143
|
-
table_name_org = table_name
|
|
150
|
+
table_name_org = table_name.replace('"', '""')
|
|
144
151
|
table_name = normalize_fieldname(table_name)
|
|
145
152
|
|
|
146
153
|
schema = self.con.execute(
|
|
@@ -161,8 +168,8 @@ class SqliteReader(AbstractReader):
|
|
|
161
168
|
fnames.append(fname)
|
|
162
169
|
|
|
163
170
|
descriptor_cls = RecordDescriptor(table_name, fields)
|
|
164
|
-
table_name_org = table_name_org.replace("
|
|
165
|
-
cursor = self.con.execute(f
|
|
171
|
+
table_name_org = table_name_org.replace('"', '""')
|
|
172
|
+
cursor = self.con.execute(f'SELECT * FROM "{table_name_org}"')
|
|
166
173
|
while True:
|
|
167
174
|
rows = cursor.fetchmany(self.batch_size)
|
|
168
175
|
if not rows:
|
|
@@ -186,19 +193,24 @@ class SqliteReader(AbstractReader):
|
|
|
186
193
|
def __iter__(self) -> Iterator[Record]:
|
|
187
194
|
"""Iterate over all tables in the database and yield records."""
|
|
188
195
|
for table_name in self.table_names():
|
|
189
|
-
|
|
196
|
+
self.logger.debug("Reading table: %s", table_name)
|
|
190
197
|
for record in self.read_table(table_name):
|
|
191
198
|
if not self.selector or self.selector.match(record):
|
|
192
199
|
yield record
|
|
193
200
|
|
|
194
201
|
|
|
195
202
|
class SqliteWriter(AbstractWriter):
|
|
196
|
-
|
|
203
|
+
"""SQLite writer."""
|
|
204
|
+
|
|
205
|
+
logger = logger
|
|
206
|
+
|
|
207
|
+
def __init__(self, path: str, *, batch_size: str | int = 1000, **kwargs):
|
|
197
208
|
self.descriptors_seen = set()
|
|
198
209
|
self.con = None
|
|
199
|
-
self.con = sqlite3.connect(path)
|
|
210
|
+
self.con = sqlite3.connect(path, isolation_level=None)
|
|
200
211
|
self.count = 0
|
|
201
212
|
self.batch_size = int(batch_size)
|
|
213
|
+
self.tx_cycle()
|
|
202
214
|
|
|
203
215
|
def write(self, record: Record) -> None:
|
|
204
216
|
"""Write a record to the database"""
|
|
@@ -207,17 +219,23 @@ class SqliteWriter(AbstractWriter):
|
|
|
207
219
|
self.descriptors_seen.add(desc)
|
|
208
220
|
create_descriptor_table(self.con, desc)
|
|
209
221
|
update_descriptor_columns(self.con, desc)
|
|
222
|
+
self.flush()
|
|
210
223
|
|
|
211
224
|
db_insert_record(self.con, record)
|
|
212
225
|
self.count += 1
|
|
213
226
|
|
|
214
227
|
# Commit every batch_size records
|
|
215
228
|
if self.count % self.batch_size == 0:
|
|
216
|
-
self.
|
|
229
|
+
self.flush()
|
|
230
|
+
|
|
231
|
+
def tx_cycle(self) -> None:
|
|
232
|
+
if self.con.in_transaction:
|
|
233
|
+
self.con.execute("COMMIT")
|
|
234
|
+
self.con.execute("BEGIN")
|
|
217
235
|
|
|
218
236
|
def flush(self) -> None:
|
|
219
237
|
if self.con:
|
|
220
|
-
self.
|
|
238
|
+
self.tx_cycle()
|
|
221
239
|
|
|
222
240
|
def close(self) -> None:
|
|
223
241
|
if self.con:
|
|
@@ -256,7 +256,8 @@ class RecordFieldRewriter:
|
|
|
256
256
|
self.exclude = exclude or []
|
|
257
257
|
self.expression = compile(expression, "<string>", "exec") if expression else None
|
|
258
258
|
|
|
259
|
-
|
|
259
|
+
self.record_descriptor_for_fields = lru_cache(256)(self.record_descriptor_for_fields)
|
|
260
|
+
|
|
260
261
|
def record_descriptor_for_fields(self, descriptor, fields=None, exclude=None, new_fields=None):
|
|
261
262
|
if not fields and not exclude and not new_fields:
|
|
262
263
|
return descriptor
|
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '3.15.dev4'
|
|
16
|
-
__version_tuple__ = version_tuple = (3, 15, 'dev4')
|
|
15
|
+
__version__ = version = '3.15.dev6'
|
|
16
|
+
__version_tuple__ = version_tuple = (3, 15, 'dev6')
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flow.record
|
|
3
|
-
Version: 3.15.dev4
|
|
3
|
+
Version: 3.15.dev6
|
|
4
4
|
Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
|
|
5
5
|
Author-email: Dissect Team <dissect@fox-it.com>
|
|
6
6
|
License: Affero General Public License v3
|
|
@@ -32,10 +32,14 @@ Provides-Extra: geoip
|
|
|
32
32
|
Requires-Dist: maxminddb; extra == "geoip"
|
|
33
33
|
Provides-Extra: avro
|
|
34
34
|
Requires-Dist: fastavro[snappy]; extra == "avro"
|
|
35
|
+
Provides-Extra: duckdb
|
|
36
|
+
Requires-Dist: duckdb; extra == "duckdb"
|
|
37
|
+
Requires-Dist: pytz; extra == "duckdb"
|
|
35
38
|
Provides-Extra: test
|
|
36
|
-
Requires-Dist:
|
|
37
|
-
Requires-Dist:
|
|
38
|
-
Requires-Dist:
|
|
39
|
+
Requires-Dist: flow.record[compression]; extra == "test"
|
|
40
|
+
Requires-Dist: flow.record[avro]; extra == "test"
|
|
41
|
+
Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
|
|
42
|
+
Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
|
|
39
43
|
|
|
40
44
|
# flow.record
|
|
41
45
|
|
|
@@ -29,6 +29,7 @@ flow/record/adapter/archive.py
|
|
|
29
29
|
flow/record/adapter/avro.py
|
|
30
30
|
flow/record/adapter/broker.py
|
|
31
31
|
flow/record/adapter/csvfile.py
|
|
32
|
+
flow/record/adapter/duckdb.py
|
|
32
33
|
flow/record/adapter/elastic.py
|
|
33
34
|
flow/record/adapter/jsonfile.py
|
|
34
35
|
flow/record/adapter/line.py
|
|
@@ -71,7 +72,7 @@ tests/test_record_descriptor.py
|
|
|
71
72
|
tests/test_regression.py
|
|
72
73
|
tests/test_selector.py
|
|
73
74
|
tests/test_splunk_adapter.py
|
|
74
|
-
tests/test_sqlite_adapter.py
|
|
75
|
+
tests/test_sqlite_duckdb_adapter.py
|
|
75
76
|
tests/utils_inspect.py
|
|
76
77
|
tests/docs/Makefile
|
|
77
78
|
tests/docs/conf.py
|
|
@@ -13,6 +13,10 @@ fastavro[snappy]
|
|
|
13
13
|
lz4
|
|
14
14
|
zstandard
|
|
15
15
|
|
|
16
|
+
[duckdb]
|
|
17
|
+
duckdb
|
|
18
|
+
pytz
|
|
19
|
+
|
|
16
20
|
[elastic]
|
|
17
21
|
elasticsearch
|
|
18
22
|
|
|
@@ -20,6 +24,9 @@ elasticsearch
|
|
|
20
24
|
maxminddb
|
|
21
25
|
|
|
22
26
|
[test]
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
27
|
+
flow.record[compression]
|
|
28
|
+
flow.record[avro]
|
|
29
|
+
|
|
30
|
+
[test:platform_python_implementation != "PyPy" and python_version < "3.12"]
|
|
31
|
+
duckdb
|
|
32
|
+
pytz
|
|
@@ -49,10 +49,15 @@ geoip = [
|
|
|
49
49
|
avro = [
|
|
50
50
|
"fastavro[snappy]",
|
|
51
51
|
]
|
|
52
|
+
duckdb = [
|
|
53
|
+
"duckdb",
|
|
54
|
+
"pytz", # duckdb requires pytz for timezone support
|
|
55
|
+
]
|
|
52
56
|
test = [
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
57
|
+
"flow.record[compression]",
|
|
58
|
+
"flow.record[avro]",
|
|
59
|
+
"duckdb; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
|
|
60
|
+
"pytz; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
|
|
56
61
|
]
|
|
57
62
|
|
|
58
63
|
[project.scripts]
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import sqlite3
|
|
2
2
|
from datetime import datetime, timezone
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any, Iterator
|
|
4
|
+
from typing import Any, Iterator, NamedTuple
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import duckdb
|
|
8
|
+
except ModuleNotFoundError:
|
|
9
|
+
duckdb = None
|
|
5
10
|
|
|
6
11
|
import pytest
|
|
7
12
|
|
|
@@ -11,6 +16,26 @@ from flow.record.base import normalize_fieldname
|
|
|
11
16
|
from flow.record.exceptions import RecordDescriptorError
|
|
12
17
|
|
|
13
18
|
|
|
19
|
+
class Database(NamedTuple):
|
|
20
|
+
scheme: str
|
|
21
|
+
connector: Any
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# We test for sqlite3 and duckdb (if available)
|
|
25
|
+
if duckdb is None:
|
|
26
|
+
databases = [
|
|
27
|
+
Database("sqlite", sqlite3),
|
|
28
|
+
]
|
|
29
|
+
else:
|
|
30
|
+
databases = [
|
|
31
|
+
Database("sqlite", sqlite3),
|
|
32
|
+
Database("duckdb", duckdb),
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
# pytest fixture that will run the test for each database in the databases list
|
|
36
|
+
sqlite_duckdb_parametrize = pytest.mark.parametrize("db", databases, ids=[db.scheme for db in databases])
|
|
37
|
+
|
|
38
|
+
|
|
14
39
|
def generate_records(amount: int) -> Iterator[Record]:
|
|
15
40
|
"""Generates some test records"""
|
|
16
41
|
TestRecordWithFooBar = RecordDescriptor(
|
|
@@ -34,11 +59,12 @@ def generate_records(amount: int) -> Iterator[Record]:
|
|
|
34
59
|
"_my_movies",
|
|
35
60
|
],
|
|
36
61
|
)
|
|
37
|
-
|
|
62
|
+
@sqlite_duckdb_parametrize
|
|
63
|
+
def test_table_name_sanitization(tmp_path: Path, table_name: str, db: Database) -> None:
|
|
38
64
|
"""Ensure that we can read table names that are technically invalid in flow.record."""
|
|
39
|
-
|
|
40
|
-
con =
|
|
41
|
-
con.execute(f"CREATE TABLE '{table_name}' (title TEXT, year INTEGER, score
|
|
65
|
+
db_path = tmp_path / "records.db"
|
|
66
|
+
con = db.connector.connect(str(db_path))
|
|
67
|
+
con.execute(f"CREATE TABLE '{table_name}' (title TEXT, year INTEGER, score DOUBLE)")
|
|
42
68
|
data = [
|
|
43
69
|
("Monty Python Live at the Hollywood Bowl", 1982, 7.9),
|
|
44
70
|
("Monty Python's The Meaning of Life", 1983, 7.5),
|
|
@@ -49,7 +75,7 @@ def test_table_name_sanitization(tmp_path: Path, table_name: str) -> None:
|
|
|
49
75
|
con.close()
|
|
50
76
|
|
|
51
77
|
data_records = []
|
|
52
|
-
with RecordReader(f"
|
|
78
|
+
with RecordReader(f"{db.scheme}://{db_path}") as reader:
|
|
53
79
|
data_records = [(record.title, record.year, record.score) for record in reader]
|
|
54
80
|
assert data == data_records
|
|
55
81
|
|
|
@@ -63,11 +89,12 @@ def test_table_name_sanitization(tmp_path: Path, table_name: str) -> None:
|
|
|
63
89
|
"1337_starting_with_number",
|
|
64
90
|
],
|
|
65
91
|
)
|
|
66
|
-
|
|
92
|
+
@sqlite_duckdb_parametrize
|
|
93
|
+
def test_field_name_sanitization(tmp_path: Path, field_name: str, db: Database) -> None:
|
|
67
94
|
"""Ensure that we can read field names that are technically invalid in flow.record."""
|
|
68
|
-
|
|
69
|
-
con =
|
|
70
|
-
con.execute(f
|
|
95
|
+
db_path = tmp_path / "records.db"
|
|
96
|
+
con = db.connector.connect(str(db_path))
|
|
97
|
+
con.execute(f'CREATE TABLE "my_table" ("{field_name}" TEXT)')
|
|
71
98
|
data = [
|
|
72
99
|
("hello",),
|
|
73
100
|
("world",),
|
|
@@ -81,7 +108,7 @@ def test_field_name_sanitization(tmp_path: Path, field_name: str) -> None:
|
|
|
81
108
|
data_records = []
|
|
82
109
|
sanitized_field_name = normalize_fieldname(field_name)
|
|
83
110
|
|
|
84
|
-
with RecordReader(f"
|
|
111
|
+
with RecordReader(f"{db.scheme}://{db_path}") as reader:
|
|
85
112
|
data_records = [(getattr(record, sanitized_field_name),) for record in reader]
|
|
86
113
|
assert data == data_records
|
|
87
114
|
|
|
@@ -95,20 +122,21 @@ def test_field_name_sanitization(tmp_path: Path, field_name: str) -> None:
|
|
|
95
122
|
2000,
|
|
96
123
|
],
|
|
97
124
|
)
|
|
98
|
-
|
|
125
|
+
@sqlite_duckdb_parametrize
|
|
126
|
+
def test_write_to_sqlite(tmp_path: Path, count: int, db: Database) -> None:
|
|
99
127
|
"""Tests writing records to a SQLite database."""
|
|
100
|
-
|
|
101
|
-
with RecordWriter(f"
|
|
128
|
+
db_path = tmp_path / "records.db"
|
|
129
|
+
with RecordWriter(f"{db.scheme}://{db_path}") as writer:
|
|
102
130
|
for record in generate_records(count):
|
|
103
131
|
writer.write(record)
|
|
104
132
|
|
|
105
133
|
record_count = 0
|
|
106
|
-
with
|
|
134
|
+
with db.connector.connect(str(db_path)) as con:
|
|
107
135
|
cursor = con.execute("SELECT COUNT(*) FROM 'test/record'")
|
|
108
136
|
record_count = cursor.fetchone()[0]
|
|
109
137
|
|
|
110
138
|
cursor = con.execute("SELECT * FROM 'test/record'")
|
|
111
|
-
for index, row in enumerate(cursor):
|
|
139
|
+
for index, row in enumerate(cursor.fetchall()):
|
|
112
140
|
assert row[0] == f"record{index}"
|
|
113
141
|
assert row[1] == "bar"
|
|
114
142
|
assert row[2] == "baz"
|
|
@@ -119,18 +147,19 @@ def test_write_to_sqlite(tmp_path: Path, count: int) -> None:
|
|
|
119
147
|
assert record_count == count
|
|
120
148
|
|
|
121
149
|
|
|
122
|
-
|
|
150
|
+
@sqlite_duckdb_parametrize
|
|
151
|
+
def test_read_from_sqlite(tmp_path: Path, db: Database) -> None:
|
|
123
152
|
"""Tests basic reading from a SQLite database."""
|
|
124
153
|
# Generate a SQLite database
|
|
125
|
-
|
|
126
|
-
with
|
|
154
|
+
db_path = tmp_path / "records.db"
|
|
155
|
+
with db.connector.connect(str(db_path)) as con:
|
|
127
156
|
con.execute(
|
|
128
157
|
"""
|
|
129
158
|
CREATE TABLE 'test/record' (
|
|
130
159
|
name TEXT,
|
|
131
160
|
data BLOB,
|
|
132
|
-
datetime
|
|
133
|
-
score
|
|
161
|
+
datetime TIMESTAMPTZ,
|
|
162
|
+
score DOUBLE
|
|
134
163
|
)
|
|
135
164
|
"""
|
|
136
165
|
)
|
|
@@ -143,7 +172,7 @@ def test_read_from_sqlite(tmp_path: Path) -> None:
|
|
|
143
172
|
)
|
|
144
173
|
|
|
145
174
|
# Read the SQLite database using flow.record
|
|
146
|
-
with RecordReader(f"
|
|
175
|
+
with RecordReader(f"{db.scheme}://{db_path}") as reader:
|
|
147
176
|
for i, record in enumerate(reader, start=1):
|
|
148
177
|
assert isinstance(record.name, str)
|
|
149
178
|
assert isinstance(record.datetime, datetime)
|
|
@@ -153,12 +182,14 @@ def test_read_from_sqlite(tmp_path: Path) -> None:
|
|
|
153
182
|
assert record.name == f"record{i}"
|
|
154
183
|
assert record.data == f"foobar{i}".encode()
|
|
155
184
|
assert record.datetime == datetime(2023, 10, i, 13, 37, tzinfo=timezone.utc)
|
|
185
|
+
assert str(record.datetime) == f"2023-10-{i:02d} 13:37:00+00:00"
|
|
156
186
|
assert record.score == 3.14 + i
|
|
157
187
|
|
|
158
188
|
|
|
159
|
-
|
|
189
|
+
@sqlite_duckdb_parametrize
|
|
190
|
+
def test_write_dynamic_descriptor(tmp_path: Path, db: Database) -> None:
|
|
160
191
|
"""Test the ability to write records with different descriptors to the same table."""
|
|
161
|
-
|
|
192
|
+
db_path = tmp_path / "records.db"
|
|
162
193
|
TestRecord = RecordDescriptor(
|
|
163
194
|
"test/dynamic",
|
|
164
195
|
[
|
|
@@ -179,7 +210,7 @@ def test_write_dynamic_descriptor(tmp_path: Path) -> None:
|
|
|
179
210
|
)
|
|
180
211
|
|
|
181
212
|
# We should be able to write records with different descriptors to the same table
|
|
182
|
-
with RecordWriter(f"
|
|
213
|
+
with RecordWriter(f"{db.scheme}://{db_path}") as writer:
|
|
183
214
|
record1 = TestRecord(name="record1", foo="bar", bar="baz")
|
|
184
215
|
writer.write(record1)
|
|
185
216
|
record2 = TestRecord_extra(name="record2", foo="bar", bar="baz", extra="extra", extra2="extra2")
|
|
@@ -187,7 +218,7 @@ def test_write_dynamic_descriptor(tmp_path: Path) -> None:
|
|
|
187
218
|
|
|
188
219
|
# The read table should be a combination of both descriptors
|
|
189
220
|
record_count = 0
|
|
190
|
-
with RecordReader(f"
|
|
221
|
+
with RecordReader(f"{db.scheme}://{db_path}") as reader:
|
|
191
222
|
for record_count, record in enumerate(reader, start=1):
|
|
192
223
|
assert record._desc.get_field_tuples() == (
|
|
193
224
|
("string", "name"),
|
|
@@ -206,14 +237,15 @@ def test_write_dynamic_descriptor(tmp_path: Path) -> None:
|
|
|
206
237
|
assert record_count == 2
|
|
207
238
|
|
|
208
239
|
|
|
209
|
-
|
|
240
|
+
@sqlite_duckdb_parametrize
|
|
241
|
+
def test_write_zero_records(tmp_path: Path, db: Database) -> None:
|
|
210
242
|
"""Test writing zero records."""
|
|
211
|
-
|
|
212
|
-
with RecordWriter(f"
|
|
243
|
+
db_path = tmp_path / "records.db"
|
|
244
|
+
with RecordWriter(f"{db.scheme}://{db_path}") as writer:
|
|
213
245
|
assert writer
|
|
214
246
|
|
|
215
247
|
# test if it's a valid database
|
|
216
|
-
with
|
|
248
|
+
with db.connector.connect(str(db_path)) as con:
|
|
217
249
|
assert con.execute("SELECT * FROM sqlite_master").fetchall() == []
|
|
218
250
|
|
|
219
251
|
|
|
@@ -295,7 +327,7 @@ def test_invalid_field_names_quoting(tmp_path: Path, invalid_field_name: str) ->
|
|
|
295
327
|
def test_prepare_insert_sql():
|
|
296
328
|
table_name = "my_table"
|
|
297
329
|
field_names = ("name", "age", "email")
|
|
298
|
-
expected_sql =
|
|
330
|
+
expected_sql = 'INSERT INTO "my_table" ("name", "age", "email") VALUES (?, ?, ?)'
|
|
299
331
|
assert prepare_insert_sql(table_name, field_names) == expected_sql
|
|
300
332
|
|
|
301
333
|
|
|
@@ -308,17 +340,24 @@ def test_prepare_insert_sql():
|
|
|
308
340
|
(1000, 0, 1000),
|
|
309
341
|
],
|
|
310
342
|
)
|
|
311
|
-
|
|
343
|
+
@sqlite_duckdb_parametrize
|
|
344
|
+
def test_batch_size(
|
|
345
|
+
tmp_path: Path,
|
|
346
|
+
batch_size: int,
|
|
347
|
+
expected_first: int,
|
|
348
|
+
expected_second: int,
|
|
349
|
+
db: Database,
|
|
350
|
+
) -> None:
|
|
312
351
|
"""Test that batch_size is respected when writing records."""
|
|
313
352
|
records = generate_records(batch_size + 100)
|
|
314
353
|
db_path = tmp_path / "records.db"
|
|
315
|
-
with RecordWriter(f"
|
|
354
|
+
with RecordWriter(f"{db.scheme}://{db_path}?batch_size={batch_size}") as writer:
|
|
316
355
|
# write a single record, should not be flushed yet if batch_size > 1
|
|
317
356
|
writer.write(next(records))
|
|
318
357
|
|
|
319
358
|
# test count of records in table (no flush yet if batch_size > 1)
|
|
320
|
-
with
|
|
321
|
-
x = con.execute(
|
|
359
|
+
with db.connector.connect(str(db_path)) as con:
|
|
360
|
+
x = con.execute('SELECT COUNT(*) FROM "test/record"')
|
|
322
361
|
assert x.fetchone()[0] is expected_first
|
|
323
362
|
|
|
324
363
|
# write at least batch_size records, should be flushed due to batch_size
|
|
@@ -326,23 +365,24 @@ def test_batch_size(tmp_path: Path, batch_size: int, expected_first: int, expect
|
|
|
326
365
|
writer.write(next(records))
|
|
327
366
|
|
|
328
367
|
# test count of records in table after flush
|
|
329
|
-
with
|
|
330
|
-
x = con.execute(
|
|
368
|
+
with db.connector.connect(str(db_path)) as con:
|
|
369
|
+
x = con.execute('SELECT COUNT(*) FROM "test/record"')
|
|
331
370
|
assert x.fetchone()[0] == expected_second
|
|
332
371
|
|
|
333
372
|
|
|
334
|
-
|
|
373
|
+
@sqlite_duckdb_parametrize
|
|
374
|
+
def test_selector(tmp_path: Path, db: Database) -> None:
|
|
335
375
|
"""Test selector when reading records."""
|
|
336
376
|
db_path = tmp_path / "records.db"
|
|
337
|
-
with RecordWriter(f"
|
|
377
|
+
with RecordWriter(f"{db.scheme}://{db_path}") as writer:
|
|
338
378
|
for record in generate_records(10):
|
|
339
379
|
writer.write(record)
|
|
340
380
|
|
|
341
|
-
with RecordReader(f"
|
|
381
|
+
with RecordReader(f"{db.scheme}://{db_path}", selector="r.name == 'record5'") as reader:
|
|
342
382
|
records = list(reader)
|
|
343
383
|
assert len(records) == 1
|
|
344
384
|
assert records[0].name == "record5"
|
|
345
385
|
|
|
346
|
-
with RecordReader(f"
|
|
386
|
+
with RecordReader(f"{db.scheme}://{db_path}", selector="r.name == 'record12345'") as reader:
|
|
347
387
|
records = list(reader)
|
|
348
388
|
assert len(records) == 0
|
|
@@ -4,7 +4,7 @@ envlist = lint, py3, pypy3
|
|
|
4
4
|
# requires if they are not available on the host system. This requires the
|
|
5
5
|
# locally installed tox to have a minimum version 3.3.0. This means the names
|
|
6
6
|
# of the configuration options are still according to the tox 3.x syntax.
|
|
7
|
-
minversion = 4.
|
|
7
|
+
minversion = 4.11.4
|
|
8
8
|
# This version of virtualenv will install setuptools version 65.5.0 and pip
|
|
9
9
|
# 22.3. These versions fully support python projects defined only through a
|
|
10
10
|
# pyproject.toml file (PEP-517/PEP-518/PEP-621)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|