flow.record 3.15.dev4__tar.gz → 3.15.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {flow.record-3.15.dev4/flow.record.egg-info → flow.record-3.15.dev6}/PKG-INFO +8 -4
  2. flow.record-3.15.dev6/flow/record/adapter/duckdb.py +56 -0
  3. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/sqlite.py +39 -21
  4. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/stream.py +2 -1
  5. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/version.py +2 -2
  6. {flow.record-3.15.dev4 → flow.record-3.15.dev6/flow.record.egg-info}/PKG-INFO +8 -4
  7. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/SOURCES.txt +2 -1
  8. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/requires.txt +10 -3
  9. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/pyproject.toml +8 -3
  10. flow.record-3.15.dev4/tests/test_sqlite_adapter.py → flow.record-3.15.dev6/tests/test_sqlite_duckdb_adapter.py +81 -41
  11. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tox.ini +1 -1
  12. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/COPYRIGHT +0 -0
  13. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/LICENSE +0 -0
  14. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/MANIFEST.in +0 -0
  15. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/README.md +0 -0
  16. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/filesystem.py +0 -0
  17. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/passivedns.py +0 -0
  18. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/records.json +0 -0
  19. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/examples/tcpconn.py +0 -0
  20. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/__init__.py +0 -0
  21. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/__init__.py +0 -0
  22. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/archive.py +0 -0
  23. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/avro.py +0 -0
  24. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/broker.py +0 -0
  25. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/csvfile.py +0 -0
  26. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/elastic.py +0 -0
  27. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/jsonfile.py +0 -0
  28. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/line.py +0 -0
  29. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/mongo.py +0 -0
  30. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/split.py +0 -0
  31. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/splunk.py +0 -0
  32. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/stream.py +0 -0
  33. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/text.py +0 -0
  34. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/adapter/xlsx.py +0 -0
  35. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/base.py +0 -0
  36. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/exceptions.py +0 -0
  37. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/__init__.py +0 -0
  38. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/credential.py +0 -0
  39. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/__init__.py +0 -0
  40. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/ip.py +0 -0
  41. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/ipv4.py +0 -0
  42. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/tcp.py +0 -0
  43. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/fieldtypes/net/udp.py +0 -0
  44. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/jsonpacker.py +0 -0
  45. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/packer.py +0 -0
  46. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/selector.py +0 -0
  47. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/tools/__init__.py +0 -0
  48. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/tools/geoip.py +0 -0
  49. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/tools/rdump.py +0 -0
  50. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/utils.py +0 -0
  51. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow/record/whitelist.py +0 -0
  52. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/dependency_links.txt +0 -0
  53. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/entry_points.txt +0 -0
  54. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/flow.record.egg-info/top_level.txt +0 -0
  55. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/setup.cfg +0 -0
  56. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/__init__.py +0 -0
  57. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/_utils.py +0 -0
  58. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/docs/Makefile +0 -0
  59. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/docs/conf.py +0 -0
  60. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/docs/index.rst +0 -0
  61. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/selector_explain_example.py +0 -0
  62. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/standalone_test.py +0 -0
  63. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_avro.py +0 -0
  64. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_avro_adapter.py +0 -0
  65. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_compiled_selector.py +0 -0
  66. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_csv_adapter.py +0 -0
  67. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_deprecations.py +0 -0
  68. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_fieldtype_ip.py +0 -0
  69. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_fieldtypes.py +0 -0
  70. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_json_packer.py +0 -0
  71. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_json_record_adapter.py +0 -0
  72. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_multi_timestamp.py +0 -0
  73. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_packer.py +0 -0
  74. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_rdump.py +0 -0
  75. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_record.py +0 -0
  76. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_record_adapter.py +0 -0
  77. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_record_descriptor.py +0 -0
  78. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_regression.py +0 -0
  79. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_selector.py +0 -0
  80. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/test_splunk_adapter.py +0 -0
  81. {flow.record-3.15.dev4 → flow.record-3.15.dev6}/tests/utils_inspect.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flow.record
3
- Version: 3.15.dev4
3
+ Version: 3.15.dev6
4
4
  Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
5
5
  Author-email: Dissect Team <dissect@fox-it.com>
6
6
  License: Affero General Public License v3
@@ -32,10 +32,14 @@ Provides-Extra: geoip
32
32
  Requires-Dist: maxminddb; extra == "geoip"
33
33
  Provides-Extra: avro
34
34
  Requires-Dist: fastavro[snappy]; extra == "avro"
35
+ Provides-Extra: duckdb
36
+ Requires-Dist: duckdb; extra == "duckdb"
37
+ Requires-Dist: pytz; extra == "duckdb"
35
38
  Provides-Extra: test
36
- Requires-Dist: lz4; extra == "test"
37
- Requires-Dist: zstandard; extra == "test"
38
- Requires-Dist: fastavro; extra == "test"
39
+ Requires-Dist: flow.record[compression]; extra == "test"
40
+ Requires-Dist: flow.record[avro]; extra == "test"
41
+ Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
42
+ Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
39
43
 
40
44
  # flow.record
41
45
 
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ import duckdb
6
+
7
+ from flow.record.adapter.sqlite import (
8
+ Selector,
9
+ SqliteReader,
10
+ SqliteWriter,
11
+ make_selector,
12
+ )
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ __usage__ = """
17
+ DuckDB adapter
18
+ ---
19
+ Write usage: rdump -w duckdb://[PATH]?batch_size=[BATCH_SIZE]
20
+ Read usage: rdump duckdb://[PATH]?batch_size=[BATCH_SIZE]
21
+ [PATH]: path to DuckDB database file
22
+
23
+ Optional parameters:
24
+ [BATCH_SIZE]: number of records to read or write in a single transaction (default: 1000)
25
+ """
26
+
27
+
28
+ class DuckdbReader(SqliteReader):
29
+ """DuckDB reader, subclasses from SQLite reader."""
30
+
31
+ logger = logger
32
+
33
+ def __init__(self, path: str, *, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
34
+ self.selector = make_selector(selector)
35
+ self.descriptors_seen = set()
36
+ self.con = duckdb.connect(path)
37
+ self.count = 0
38
+ self.batch_size = int(batch_size)
39
+
40
+
41
+ class DuckdbWriter(SqliteWriter):
42
+ """DuckDB writer, subclasses from SQLite writer."""
43
+
44
+ logger = logger
45
+
46
+ def __init__(self, path: str, *, batch_size: str | int = 1000, **kwargs):
47
+ self.descriptors_seen = set()
48
+ self.con = None
49
+ self.con = duckdb.connect(path)
50
+ self.count = 0
51
+ self.batch_size = int(batch_size)
52
+ self.con.begin()
53
+
54
+ def tx_cycle(self) -> None:
55
+ self.con.commit()
56
+ self.con.begin()
@@ -18,7 +18,7 @@ SQLite adapter
18
18
  ---
19
19
  Write usage: rdump -w sqlite://[PATH]?batch_size=[BATCH_SIZE]
20
20
  Read usage: rdump sqlite://[PATH]?batch_size=[BATCH_SIZE]
21
- [PATH]: path to sqlite database file
21
+ [PATH]: path to SQLite database file
22
22
 
23
23
  Optional parameters:
24
24
  [BATCH_SIZE]: number of records to read or write in a single transaction (default: 1000)
@@ -28,12 +28,12 @@ Optional parameters:
28
28
  FIELD_MAP = {
29
29
  "int": "INTEGER",
30
30
  "uint32": "INTEGER",
31
- "varint": "INTEGER",
31
+ "varint": "BIGINT",
32
32
  "float": "REAL",
33
33
  "boolean": "INTEGER",
34
34
  "bytes": "BLOB",
35
- "filesize": "INTEGER",
36
- "datetime": "TIMESTAMP",
35
+ "filesize": "BIGINT",
36
+ "datetime": "TIMESTAMPTZ",
37
37
  }
38
38
 
39
39
 
@@ -41,12 +41,15 @@ FIELD_MAP = {
41
41
  SQLITE_FIELD_MAP = {
42
42
  "VARCHAR": "string",
43
43
  "INTEGER": "varint",
44
+ "BIGINT": "varint",
44
45
  "BLOB": "bytes",
45
46
  "REAL": "float",
46
47
  "DOUBLE": "float",
47
48
  "BOOLEAN": "boolean",
48
49
  "DATETIME": "datetime",
49
50
  "TIMESTAMP": "datetime",
51
+ "TIMESTAMPTZ": "datetime",
52
+ "TIMESTAMP WITH TIME ZONE": "datetime",
50
53
  }
51
54
 
52
55
 
@@ -58,11 +61,11 @@ def create_descriptor_table(con: sqlite3.Connection, descriptor: RecordDescripto
58
61
  column_defs = []
59
62
  for column_name, fieldset in descriptor.get_all_fields().items():
60
63
  column_type = FIELD_MAP.get(fieldset.typename, "TEXT")
61
- column_defs.append(f" `{column_name}` {column_type}")
64
+ column_defs.append(f' "{column_name}" {column_type}')
62
65
  sql_columns = ",\n".join(column_defs)
63
66
 
64
67
  # Create the descriptor table
65
- sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (\n{sql_columns}\n)"
68
+ sql = f'CREATE TABLE IF NOT EXISTS "{table_name}" (\n{sql_columns}\n)'
66
69
  logger.debug(sql)
67
70
  con.execute(sql)
68
71
 
@@ -72,7 +75,7 @@ def update_descriptor_columns(con: sqlite3.Connection, descriptor: RecordDescrip
72
75
  table_name = descriptor.name
73
76
 
74
77
  # Get existing columns
75
- cursor = con.execute(f"PRAGMA table_info(`{table_name}`)")
78
+ cursor = con.execute(f'PRAGMA table_info("{table_name}")')
76
79
  column_names = set(row[1] for row in cursor.fetchall())
77
80
 
78
81
  # Add missing columns
@@ -81,23 +84,23 @@ def update_descriptor_columns(con: sqlite3.Connection, descriptor: RecordDescrip
81
84
  if column_name in column_names:
82
85
  continue
83
86
  column_type = FIELD_MAP.get(fieldset.typename, "TEXT")
84
- column_defs.append(f" ALTER TABLE `{table_name}` ADD COLUMN `{column_name}` {column_type}")
87
+ column_defs.append(f' ALTER TABLE "{table_name}" ADD COLUMN "{column_name}" {column_type}')
85
88
 
86
89
  # No missing columns
87
90
  if not column_defs:
88
91
  return None
89
92
 
90
93
  # Add the new columns
91
- sql = ";\n".join(column_defs)
92
- con.executescript(sql)
94
+ for col_def in column_defs:
95
+ con.execute(col_def)
93
96
 
94
97
 
95
98
  @lru_cache(maxsize=1000)
96
99
  def prepare_insert_sql(table_name: str, field_names: tuple[str]) -> str:
97
100
  """Return (cached) prepared SQL statement for inserting a record based on table name and field names."""
98
- column_names = ", ".join(f"`{name}`" for name in field_names)
101
+ column_names = ", ".join(f'"{name}"' for name in field_names)
99
102
  value_placeholder = ", ".join(["?"] * len(field_names))
100
- return f"INSERT INTO `{table_name}` ({column_names}) VALUES ({value_placeholder})"
103
+ return f'INSERT INTO "{table_name}" ({column_names}) VALUES ({value_placeholder})'
101
104
 
102
105
 
103
106
  def db_insert_record(con: sqlite3.Connection, record: Record) -> None:
@@ -123,7 +126,11 @@ def db_insert_record(con: sqlite3.Connection, record: Record) -> None:
123
126
 
124
127
 
125
128
  class SqliteReader(AbstractReader):
126
- def __init__(self, path: str, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
129
+ """SQLite reader."""
130
+
131
+ logger = logger
132
+
133
+ def __init__(self, path: str, *, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
127
134
  self.selector = make_selector(selector)
128
135
  self.descriptors_seen = set()
129
136
  self.con = sqlite3.connect(path)
@@ -140,7 +147,7 @@ class SqliteReader(AbstractReader):
140
147
 
141
148
  # flow.record is quite strict with what is allowed in fieldnames or decriptor name.
142
149
  # While SQLite is less strict, we need to sanitize the names to make them compatible.
143
- table_name_org = table_name
150
+ table_name_org = table_name.replace('"', '""')
144
151
  table_name = normalize_fieldname(table_name)
145
152
 
146
153
  schema = self.con.execute(
@@ -161,8 +168,8 @@ class SqliteReader(AbstractReader):
161
168
  fnames.append(fname)
162
169
 
163
170
  descriptor_cls = RecordDescriptor(table_name, fields)
164
- table_name_org = table_name_org.replace("`", r"\\\`")
165
- cursor = self.con.execute(f"SELECT * FROM `{table_name_org}`")
171
+ table_name_org = table_name_org.replace('"', '""')
172
+ cursor = self.con.execute(f'SELECT * FROM "{table_name_org}"')
166
173
  while True:
167
174
  rows = cursor.fetchmany(self.batch_size)
168
175
  if not rows:
@@ -186,19 +193,24 @@ class SqliteReader(AbstractReader):
186
193
  def __iter__(self) -> Iterator[Record]:
187
194
  """Iterate over all tables in the database and yield records."""
188
195
  for table_name in self.table_names():
189
- logging.debug("Reading table: %s", table_name)
196
+ self.logger.debug("Reading table: %s", table_name)
190
197
  for record in self.read_table(table_name):
191
198
  if not self.selector or self.selector.match(record):
192
199
  yield record
193
200
 
194
201
 
195
202
  class SqliteWriter(AbstractWriter):
196
- def __init__(self, path: str, batch_size: str | int = 1000, **kwargs):
203
+ """SQLite writer."""
204
+
205
+ logger = logger
206
+
207
+ def __init__(self, path: str, *, batch_size: str | int = 1000, **kwargs):
197
208
  self.descriptors_seen = set()
198
209
  self.con = None
199
- self.con = sqlite3.connect(path)
210
+ self.con = sqlite3.connect(path, isolation_level=None)
200
211
  self.count = 0
201
212
  self.batch_size = int(batch_size)
213
+ self.tx_cycle()
202
214
 
203
215
  def write(self, record: Record) -> None:
204
216
  """Write a record to the database"""
@@ -207,17 +219,23 @@ class SqliteWriter(AbstractWriter):
207
219
  self.descriptors_seen.add(desc)
208
220
  create_descriptor_table(self.con, desc)
209
221
  update_descriptor_columns(self.con, desc)
222
+ self.flush()
210
223
 
211
224
  db_insert_record(self.con, record)
212
225
  self.count += 1
213
226
 
214
227
  # Commit every batch_size records
215
228
  if self.count % self.batch_size == 0:
216
- self.con.commit()
229
+ self.flush()
230
+
231
+ def tx_cycle(self) -> None:
232
+ if self.con.in_transaction:
233
+ self.con.execute("COMMIT")
234
+ self.con.execute("BEGIN")
217
235
 
218
236
  def flush(self) -> None:
219
237
  if self.con:
220
- self.con.commit()
238
+ self.tx_cycle()
221
239
 
222
240
  def close(self) -> None:
223
241
  if self.con:
@@ -256,7 +256,8 @@ class RecordFieldRewriter:
256
256
  self.exclude = exclude or []
257
257
  self.expression = compile(expression, "<string>", "exec") if expression else None
258
258
 
259
- @lru_cache(maxsize=256)
259
+ self.record_descriptor_for_fields = lru_cache(256)(self.record_descriptor_for_fields)
260
+
260
261
  def record_descriptor_for_fields(self, descriptor, fields=None, exclude=None, new_fields=None):
261
262
  if not fields and not exclude and not new_fields:
262
263
  return descriptor
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '3.15.dev4'
16
- __version_tuple__ = version_tuple = (3, 15, 'dev4')
15
+ __version__ = version = '3.15.dev6'
16
+ __version_tuple__ = version_tuple = (3, 15, 'dev6')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flow.record
3
- Version: 3.15.dev4
3
+ Version: 3.15.dev6
4
4
  Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
5
5
  Author-email: Dissect Team <dissect@fox-it.com>
6
6
  License: Affero General Public License v3
@@ -32,10 +32,14 @@ Provides-Extra: geoip
32
32
  Requires-Dist: maxminddb; extra == "geoip"
33
33
  Provides-Extra: avro
34
34
  Requires-Dist: fastavro[snappy]; extra == "avro"
35
+ Provides-Extra: duckdb
36
+ Requires-Dist: duckdb; extra == "duckdb"
37
+ Requires-Dist: pytz; extra == "duckdb"
35
38
  Provides-Extra: test
36
- Requires-Dist: lz4; extra == "test"
37
- Requires-Dist: zstandard; extra == "test"
38
- Requires-Dist: fastavro; extra == "test"
39
+ Requires-Dist: flow.record[compression]; extra == "test"
40
+ Requires-Dist: flow.record[avro]; extra == "test"
41
+ Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
42
+ Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
39
43
 
40
44
  # flow.record
41
45
 
@@ -29,6 +29,7 @@ flow/record/adapter/archive.py
29
29
  flow/record/adapter/avro.py
30
30
  flow/record/adapter/broker.py
31
31
  flow/record/adapter/csvfile.py
32
+ flow/record/adapter/duckdb.py
32
33
  flow/record/adapter/elastic.py
33
34
  flow/record/adapter/jsonfile.py
34
35
  flow/record/adapter/line.py
@@ -71,7 +72,7 @@ tests/test_record_descriptor.py
71
72
  tests/test_regression.py
72
73
  tests/test_selector.py
73
74
  tests/test_splunk_adapter.py
74
- tests/test_sqlite_adapter.py
75
+ tests/test_sqlite_duckdb_adapter.py
75
76
  tests/utils_inspect.py
76
77
  tests/docs/Makefile
77
78
  tests/docs/conf.py
@@ -13,6 +13,10 @@ fastavro[snappy]
13
13
  lz4
14
14
  zstandard
15
15
 
16
+ [duckdb]
17
+ duckdb
18
+ pytz
19
+
16
20
  [elastic]
17
21
  elasticsearch
18
22
 
@@ -20,6 +24,9 @@ elasticsearch
20
24
  maxminddb
21
25
 
22
26
  [test]
23
- lz4
24
- zstandard
25
- fastavro
27
+ flow.record[compression]
28
+ flow.record[avro]
29
+
30
+ [test:platform_python_implementation != "PyPy" and python_version < "3.12"]
31
+ duckdb
32
+ pytz
@@ -49,10 +49,15 @@ geoip = [
49
49
  avro = [
50
50
  "fastavro[snappy]",
51
51
  ]
52
+ duckdb = [
53
+ "duckdb",
54
+ "pytz", # duckdb requires pytz for timezone support
55
+ ]
52
56
  test = [
53
- "lz4",
54
- "zstandard",
55
- "fastavro",
57
+ "flow.record[compression]",
58
+ "flow.record[avro]",
59
+ "duckdb; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
60
+ "pytz; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
56
61
  ]
57
62
 
58
63
  [project.scripts]
@@ -1,7 +1,12 @@
1
1
  import sqlite3
2
2
  from datetime import datetime, timezone
3
3
  from pathlib import Path
4
- from typing import Any, Iterator
4
+ from typing import Any, Iterator, NamedTuple
5
+
6
+ try:
7
+ import duckdb
8
+ except ModuleNotFoundError:
9
+ duckdb = None
5
10
 
6
11
  import pytest
7
12
 
@@ -11,6 +16,26 @@ from flow.record.base import normalize_fieldname
11
16
  from flow.record.exceptions import RecordDescriptorError
12
17
 
13
18
 
19
+ class Database(NamedTuple):
20
+ scheme: str
21
+ connector: Any
22
+
23
+
24
+ # We test for sqlite3 and duckdb (if available)
25
+ if duckdb is None:
26
+ databases = [
27
+ Database("sqlite", sqlite3),
28
+ ]
29
+ else:
30
+ databases = [
31
+ Database("sqlite", sqlite3),
32
+ Database("duckdb", duckdb),
33
+ ]
34
+
35
+ # pytest fixture that will run the test for each database in the databases list
36
+ sqlite_duckdb_parametrize = pytest.mark.parametrize("db", databases, ids=[db.scheme for db in databases])
37
+
38
+
14
39
  def generate_records(amount: int) -> Iterator[Record]:
15
40
  """Generates some test records"""
16
41
  TestRecordWithFooBar = RecordDescriptor(
@@ -34,11 +59,12 @@ def generate_records(amount: int) -> Iterator[Record]:
34
59
  "_my_movies",
35
60
  ],
36
61
  )
37
- def test_table_name_sanitization(tmp_path: Path, table_name: str) -> None:
62
+ @sqlite_duckdb_parametrize
63
+ def test_table_name_sanitization(tmp_path: Path, table_name: str, db: Database) -> None:
38
64
  """Ensure that we can read table names that are technically invalid in flow.record."""
39
- db = tmp_path / "records.db"
40
- con = sqlite3.connect(db)
41
- con.execute(f"CREATE TABLE '{table_name}' (title TEXT, year INTEGER, score REAL)")
65
+ db_path = tmp_path / "records.db"
66
+ con = db.connector.connect(str(db_path))
67
+ con.execute(f"CREATE TABLE '{table_name}' (title TEXT, year INTEGER, score DOUBLE)")
42
68
  data = [
43
69
  ("Monty Python Live at the Hollywood Bowl", 1982, 7.9),
44
70
  ("Monty Python's The Meaning of Life", 1983, 7.5),
@@ -49,7 +75,7 @@ def test_table_name_sanitization(tmp_path: Path, table_name: str) -> None:
49
75
  con.close()
50
76
 
51
77
  data_records = []
52
- with RecordReader(f"sqlite://{db}") as reader:
78
+ with RecordReader(f"{db.scheme}://{db_path}") as reader:
53
79
  data_records = [(record.title, record.year, record.score) for record in reader]
54
80
  assert data == data_records
55
81
 
@@ -63,11 +89,12 @@ def test_table_name_sanitization(tmp_path: Path, table_name: str) -> None:
63
89
  "1337_starting_with_number",
64
90
  ],
65
91
  )
66
- def test_field_name_sanitization(tmp_path: Path, field_name: str) -> None:
92
+ @sqlite_duckdb_parametrize
93
+ def test_field_name_sanitization(tmp_path: Path, field_name: str, db: Database) -> None:
67
94
  """Ensure that we can read field names that are technically invalid in flow.record."""
68
- db = tmp_path / "records.db"
69
- con = sqlite3.connect(db)
70
- con.execute(f"CREATE TABLE 'my_table' ('{field_name}' TEXT)")
95
+ db_path = tmp_path / "records.db"
96
+ con = db.connector.connect(str(db_path))
97
+ con.execute(f'CREATE TABLE "my_table" ("{field_name}" TEXT)')
71
98
  data = [
72
99
  ("hello",),
73
100
  ("world",),
@@ -81,7 +108,7 @@ def test_field_name_sanitization(tmp_path: Path, field_name: str) -> None:
81
108
  data_records = []
82
109
  sanitized_field_name = normalize_fieldname(field_name)
83
110
 
84
- with RecordReader(f"sqlite://{db}") as reader:
111
+ with RecordReader(f"{db.scheme}://{db_path}") as reader:
85
112
  data_records = [(getattr(record, sanitized_field_name),) for record in reader]
86
113
  assert data == data_records
87
114
 
@@ -95,20 +122,21 @@ def test_field_name_sanitization(tmp_path: Path, field_name: str) -> None:
95
122
  2000,
96
123
  ],
97
124
  )
98
- def test_write_to_sqlite(tmp_path: Path, count: int) -> None:
125
+ @sqlite_duckdb_parametrize
126
+ def test_write_to_sqlite(tmp_path: Path, count: int, db: Database) -> None:
99
127
  """Tests writing records to a SQLite database."""
100
- db = tmp_path / "records.db"
101
- with RecordWriter(f"sqlite://{db}") as writer:
128
+ db_path = tmp_path / "records.db"
129
+ with RecordWriter(f"{db.scheme}://{db_path}") as writer:
102
130
  for record in generate_records(count):
103
131
  writer.write(record)
104
132
 
105
133
  record_count = 0
106
- with sqlite3.connect(db) as con:
134
+ with db.connector.connect(str(db_path)) as con:
107
135
  cursor = con.execute("SELECT COUNT(*) FROM 'test/record'")
108
136
  record_count = cursor.fetchone()[0]
109
137
 
110
138
  cursor = con.execute("SELECT * FROM 'test/record'")
111
- for index, row in enumerate(cursor):
139
+ for index, row in enumerate(cursor.fetchall()):
112
140
  assert row[0] == f"record{index}"
113
141
  assert row[1] == "bar"
114
142
  assert row[2] == "baz"
@@ -119,18 +147,19 @@ def test_write_to_sqlite(tmp_path: Path, count: int) -> None:
119
147
  assert record_count == count
120
148
 
121
149
 
122
- def test_read_from_sqlite(tmp_path: Path) -> None:
150
+ @sqlite_duckdb_parametrize
151
+ def test_read_from_sqlite(tmp_path: Path, db: Database) -> None:
123
152
  """Tests basic reading from a SQLite database."""
124
153
  # Generate a SQLite database
125
- db = tmp_path / "records.db"
126
- with sqlite3.connect(db) as con:
154
+ db_path = tmp_path / "records.db"
155
+ with db.connector.connect(str(db_path)) as con:
127
156
  con.execute(
128
157
  """
129
158
  CREATE TABLE 'test/record' (
130
159
  name TEXT,
131
160
  data BLOB,
132
- datetime DATETIME,
133
- score REAL
161
+ datetime TIMESTAMPTZ,
162
+ score DOUBLE
134
163
  )
135
164
  """
136
165
  )
@@ -143,7 +172,7 @@ def test_read_from_sqlite(tmp_path: Path) -> None:
143
172
  )
144
173
 
145
174
  # Read the SQLite database using flow.record
146
- with RecordReader(f"sqlite://{db}") as reader:
175
+ with RecordReader(f"{db.scheme}://{db_path}") as reader:
147
176
  for i, record in enumerate(reader, start=1):
148
177
  assert isinstance(record.name, str)
149
178
  assert isinstance(record.datetime, datetime)
@@ -153,12 +182,14 @@ def test_read_from_sqlite(tmp_path: Path) -> None:
153
182
  assert record.name == f"record{i}"
154
183
  assert record.data == f"foobar{i}".encode()
155
184
  assert record.datetime == datetime(2023, 10, i, 13, 37, tzinfo=timezone.utc)
185
+ assert str(record.datetime) == f"2023-10-{i:02d} 13:37:00+00:00"
156
186
  assert record.score == 3.14 + i
157
187
 
158
188
 
159
- def test_write_dynamic_descriptor(tmp_path: Path) -> None:
189
+ @sqlite_duckdb_parametrize
190
+ def test_write_dynamic_descriptor(tmp_path: Path, db: Database) -> None:
160
191
  """Test the ability to write records with different descriptors to the same table."""
161
- db = tmp_path / "records.db"
192
+ db_path = tmp_path / "records.db"
162
193
  TestRecord = RecordDescriptor(
163
194
  "test/dynamic",
164
195
  [
@@ -179,7 +210,7 @@ def test_write_dynamic_descriptor(tmp_path: Path) -> None:
179
210
  )
180
211
 
181
212
  # We should be able to write records with different descriptors to the same table
182
- with RecordWriter(f"sqlite://{db}") as writer:
213
+ with RecordWriter(f"{db.scheme}://{db_path}") as writer:
183
214
  record1 = TestRecord(name="record1", foo="bar", bar="baz")
184
215
  writer.write(record1)
185
216
  record2 = TestRecord_extra(name="record2", foo="bar", bar="baz", extra="extra", extra2="extra2")
@@ -187,7 +218,7 @@ def test_write_dynamic_descriptor(tmp_path: Path) -> None:
187
218
 
188
219
  # The read table should be a combination of both descriptors
189
220
  record_count = 0
190
- with RecordReader(f"sqlite://{db}") as reader:
221
+ with RecordReader(f"{db.scheme}://{db_path}") as reader:
191
222
  for record_count, record in enumerate(reader, start=1):
192
223
  assert record._desc.get_field_tuples() == (
193
224
  ("string", "name"),
@@ -206,14 +237,15 @@ def test_write_dynamic_descriptor(tmp_path: Path) -> None:
206
237
  assert record_count == 2
207
238
 
208
239
 
209
- def test_write_zero_records(tmp_path: Path) -> None:
240
+ @sqlite_duckdb_parametrize
241
+ def test_write_zero_records(tmp_path: Path, db: Database) -> None:
210
242
  """Test writing zero records."""
211
- db = tmp_path / "records.db"
212
- with RecordWriter(f"sqlite://{db}") as writer:
243
+ db_path = tmp_path / "records.db"
244
+ with RecordWriter(f"{db.scheme}://{db_path}") as writer:
213
245
  assert writer
214
246
 
215
247
  # test if it's a valid database
216
- with sqlite3.connect(db) as con:
248
+ with db.connector.connect(str(db_path)) as con:
217
249
  assert con.execute("SELECT * FROM sqlite_master").fetchall() == []
218
250
 
219
251
 
@@ -295,7 +327,7 @@ def test_invalid_field_names_quoting(tmp_path: Path, invalid_field_name: str) ->
295
327
  def test_prepare_insert_sql():
296
328
  table_name = "my_table"
297
329
  field_names = ("name", "age", "email")
298
- expected_sql = "INSERT INTO `my_table` (`name`, `age`, `email`) VALUES (?, ?, ?)"
330
+ expected_sql = 'INSERT INTO "my_table" ("name", "age", "email") VALUES (?, ?, ?)'
299
331
  assert prepare_insert_sql(table_name, field_names) == expected_sql
300
332
 
301
333
 
@@ -308,17 +340,24 @@ def test_prepare_insert_sql():
308
340
  (1000, 0, 1000),
309
341
  ],
310
342
  )
311
- def test_batch_size(tmp_path: Path, batch_size: int, expected_first: int, expected_second: int) -> None:
343
+ @sqlite_duckdb_parametrize
344
+ def test_batch_size(
345
+ tmp_path: Path,
346
+ batch_size: int,
347
+ expected_first: int,
348
+ expected_second: int,
349
+ db: Database,
350
+ ) -> None:
312
351
  """Test that batch_size is respected when writing records."""
313
352
  records = generate_records(batch_size + 100)
314
353
  db_path = tmp_path / "records.db"
315
- with RecordWriter(f"sqlite://{db_path}?batch_size={batch_size}") as writer:
354
+ with RecordWriter(f"{db.scheme}://{db_path}?batch_size={batch_size}") as writer:
316
355
  # write a single record, should not be flushed yet if batch_size > 1
317
356
  writer.write(next(records))
318
357
 
319
358
  # test count of records in table (no flush yet if batch_size > 1)
320
- with sqlite3.connect(db_path) as con:
321
- x = con.execute("select count(*) from `test/record`")
359
+ with db.connector.connect(str(db_path)) as con:
360
+ x = con.execute('SELECT COUNT(*) FROM "test/record"')
322
361
  assert x.fetchone()[0] is expected_first
323
362
 
324
363
  # write at least batch_size records, should be flushed due to batch_size
@@ -326,23 +365,24 @@ def test_batch_size(tmp_path: Path, batch_size: int, expected_first: int, expect
326
365
  writer.write(next(records))
327
366
 
328
367
  # test count of records in table after flush
329
- with sqlite3.connect(db_path) as con:
330
- x = con.execute("select count(*) from `test/record`")
368
+ with db.connector.connect(str(db_path)) as con:
369
+ x = con.execute('SELECT COUNT(*) FROM "test/record"')
331
370
  assert x.fetchone()[0] == expected_second
332
371
 
333
372
 
334
- def test_selector(tmp_path: Path) -> None:
373
+ @sqlite_duckdb_parametrize
374
+ def test_selector(tmp_path: Path, db: Database) -> None:
335
375
  """Test selector when reading records."""
336
376
  db_path = tmp_path / "records.db"
337
- with RecordWriter(f"sqlite://{db_path}") as writer:
377
+ with RecordWriter(f"{db.scheme}://{db_path}") as writer:
338
378
  for record in generate_records(10):
339
379
  writer.write(record)
340
380
 
341
- with RecordReader(f"sqlite://{db_path}", selector="r.name == 'record5'") as reader:
381
+ with RecordReader(f"{db.scheme}://{db_path}", selector="r.name == 'record5'") as reader:
342
382
  records = list(reader)
343
383
  assert len(records) == 1
344
384
  assert records[0].name == "record5"
345
385
 
346
- with RecordReader(f"sqlite://{db_path}", selector="r.name == 'record12345'") as reader:
386
+ with RecordReader(f"{db.scheme}://{db_path}", selector="r.name == 'record12345'") as reader:
347
387
  records = list(reader)
348
388
  assert len(records) == 0
@@ -4,7 +4,7 @@ envlist = lint, py3, pypy3
4
4
  # requires if they are not available on the host system. This requires the
5
5
  # locally installed tox to have a minimum version 3.3.0. This means the names
6
6
  # of the configuration options are still according to the tox 3.x syntax.
7
- minversion = 4.2.4
7
+ minversion = 4.11.4
8
8
  # This version of virtualenv will install setuptools version 65.5.0 and pip
9
9
  # 22.3. These versions fully support python projects defined only through a
10
10
  # pyproject.toml file (PEP-517/PEP-518/PEP-621)
File without changes