flow.record 3.14.dev4.tar.gz → 3.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {flow.record-3.14.dev4/flow.record.egg-info → flow_record-3.15}/PKG-INFO +10 -4
  2. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/__init__.py +4 -0
  3. flow_record-3.15/flow/record/adapter/duckdb.py +56 -0
  4. flow_record-3.15/flow/record/adapter/line.py +81 -0
  5. flow_record-3.15/flow/record/adapter/splunk.py +282 -0
  6. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/sqlite.py +39 -21
  7. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/base.py +67 -4
  8. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/__init__.py +145 -24
  9. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/jsonpacker.py +5 -0
  10. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/selector.py +0 -3
  11. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/stream.py +2 -1
  12. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/tools/rdump.py +13 -1
  13. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/version.py +2 -2
  14. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/whitelist.py +1 -0
  15. {flow.record-3.14.dev4 → flow_record-3.15/flow.record.egg-info}/PKG-INFO +10 -4
  16. {flow.record-3.14.dev4 → flow_record-3.15}/flow.record.egg-info/SOURCES.txt +2 -1
  17. {flow.record-3.14.dev4 → flow_record-3.15}/flow.record.egg-info/requires.txt +13 -3
  18. {flow.record-3.14.dev4 → flow_record-3.15}/pyproject.toml +12 -5
  19. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_fieldtypes.py +168 -15
  20. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_rdump.py +64 -10
  21. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_record.py +144 -1
  22. flow_record-3.15/tests/test_splunk_adapter.py +403 -0
  23. flow.record-3.14.dev4/tests/test_sqlite_adapter.py → flow_record-3.15/tests/test_sqlite_duckdb_adapter.py +81 -41
  24. {flow.record-3.14.dev4 → flow_record-3.15}/tox.ini +1 -1
  25. flow.record-3.14.dev4/flow/record/adapter/line.py +0 -44
  26. flow.record-3.14.dev4/flow/record/adapter/splunk.py +0 -90
  27. flow.record-3.14.dev4/tests/test_splunk_adapter.py +0 -104
  28. {flow.record-3.14.dev4 → flow_record-3.15}/COPYRIGHT +0 -0
  29. {flow.record-3.14.dev4 → flow_record-3.15}/LICENSE +0 -0
  30. {flow.record-3.14.dev4 → flow_record-3.15}/MANIFEST.in +0 -0
  31. {flow.record-3.14.dev4 → flow_record-3.15}/README.md +0 -0
  32. {flow.record-3.14.dev4 → flow_record-3.15}/examples/filesystem.py +0 -0
  33. {flow.record-3.14.dev4 → flow_record-3.15}/examples/passivedns.py +0 -0
  34. {flow.record-3.14.dev4 → flow_record-3.15}/examples/records.json +0 -0
  35. {flow.record-3.14.dev4 → flow_record-3.15}/examples/tcpconn.py +0 -0
  36. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/__init__.py +0 -0
  37. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/archive.py +0 -0
  38. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/avro.py +0 -0
  39. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/broker.py +0 -0
  40. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/csvfile.py +0 -0
  41. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/elastic.py +0 -0
  42. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/jsonfile.py +0 -0
  43. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/mongo.py +0 -0
  44. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/split.py +0 -0
  45. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/stream.py +0 -0
  46. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/text.py +0 -0
  47. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/xlsx.py +0 -0
  48. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/exceptions.py +0 -0
  49. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/credential.py +0 -0
  50. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/net/__init__.py +0 -0
  51. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/net/ip.py +0 -0
  52. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/net/ipv4.py +0 -0
  53. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/net/tcp.py +0 -0
  54. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/fieldtypes/net/udp.py +0 -0
  55. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/packer.py +0 -0
  56. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/tools/__init__.py +0 -0
  57. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/tools/geoip.py +0 -0
  58. {flow.record-3.14.dev4 → flow_record-3.15}/flow/record/utils.py +0 -0
  59. {flow.record-3.14.dev4 → flow_record-3.15}/flow.record.egg-info/dependency_links.txt +0 -0
  60. {flow.record-3.14.dev4 → flow_record-3.15}/flow.record.egg-info/entry_points.txt +0 -0
  61. {flow.record-3.14.dev4 → flow_record-3.15}/flow.record.egg-info/top_level.txt +0 -0
  62. {flow.record-3.14.dev4 → flow_record-3.15}/setup.cfg +0 -0
  63. {flow.record-3.14.dev4 → flow_record-3.15}/tests/__init__.py +0 -0
  64. {flow.record-3.14.dev4 → flow_record-3.15}/tests/_utils.py +0 -0
  65. {flow.record-3.14.dev4 → flow_record-3.15}/tests/docs/Makefile +0 -0
  66. {flow.record-3.14.dev4 → flow_record-3.15}/tests/docs/conf.py +0 -0
  67. {flow.record-3.14.dev4 → flow_record-3.15}/tests/docs/index.rst +0 -0
  68. {flow.record-3.14.dev4 → flow_record-3.15}/tests/selector_explain_example.py +0 -0
  69. {flow.record-3.14.dev4 → flow_record-3.15}/tests/standalone_test.py +0 -0
  70. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_avro.py +0 -0
  71. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_avro_adapter.py +0 -0
  72. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_compiled_selector.py +0 -0
  73. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_csv_adapter.py +0 -0
  74. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_deprecations.py +0 -0
  75. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_fieldtype_ip.py +0 -0
  76. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_json_packer.py +0 -0
  77. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_json_record_adapter.py +0 -0
  78. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_multi_timestamp.py +0 -0
  79. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_packer.py +0 -0
  80. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_record_adapter.py +0 -0
  81. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_record_descriptor.py +0 -0
  82. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_regression.py +0 -0
  83. {flow.record-3.14.dev4 → flow_record-3.15}/tests/test_selector.py +0 -0
  84. {flow.record-3.14.dev4 → flow_record-3.15}/tests/utils_inspect.py +0 -0
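Version 3.15's main changes, visible in the diffs below: new DuckDB and line-output adapters, a rewritten Splunk adapter with HTTP(S) collector support, explicit transaction handling in the SQLite adapter, and a record-comparison API exported from flow/record/__init__.py. A minimal sketch of that comparison API, assuming set_ignored_fields_for_comparison() takes an iterable of field names to exclude from record equality; the test/record descriptor is made up for illustration:

    from flow.record import RecordDescriptor, set_ignored_fields_for_comparison

    TestRecord = RecordDescriptor("test/record", [("string", "value")])

    a = TestRecord(value="hello")
    b = TestRecord(value="hello")

    # The internal _generated timestamps normally differ between two records;
    # assuming the ignore list excludes those fields from comparison, the two
    # records should now compare equal.
    set_ignored_fields_for_comparison(["_generated", "_source"])
    assert a == b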
{flow.record-3.14.dev4/flow.record.egg-info → flow_record-3.15}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flow.record
-Version: 3.14.dev4
+Version: 3.15
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -32,10 +32,16 @@ Provides-Extra: geoip
 Requires-Dist: maxminddb; extra == "geoip"
 Provides-Extra: avro
 Requires-Dist: fastavro[snappy]; extra == "avro"
+Provides-Extra: duckdb
+Requires-Dist: duckdb; extra == "duckdb"
+Requires-Dist: pytz; extra == "duckdb"
+Provides-Extra: splunk
+Requires-Dist: httpx; extra == "splunk"
 Provides-Extra: test
-Requires-Dist: lz4; extra == "test"
-Requires-Dist: zstandard; extra == "test"
-Requires-Dist: fastavro; extra == "test"
+Requires-Dist: flow.record[compression]; extra == "test"
+Requires-Dist: flow.record[avro]; extra == "test"
+Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
+Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"

 # flow.record

{flow.record-3.14.dev4 → flow_record-3.15}/flow/record/__init__.py
@@ -2,6 +2,7 @@ import gzip
 import os

 from flow.record.base import (
+    IGNORE_FIELDS_FOR_COMPARISON,
     RECORD_VERSION,
     RECORDSTREAM_MAGIC,
     DynamicDescriptor,
@@ -20,6 +21,7 @@ from flow.record.base import (
     open_path,
     open_path_or_stream,
     open_stream,
+    set_ignored_fields_for_comparison,
     stream,
 )
 from flow.record.jsonpacker import JsonRecordPacker
@@ -35,6 +37,7 @@ from flow.record.stream import (
 )

 __all__ = [
+    "IGNORE_FIELDS_FOR_COMPARISON",
     "RECORD_VERSION",
     "RECORDSTREAM_MAGIC",
     "FieldType",
@@ -54,6 +57,7 @@ __all__ = [
     "open_path_or_stream",
     "open_path",
     "open_stream",
+    "set_ignored_fields_for_comparison",
     "stream",
     "dynamic_fieldtype",
     "DynamicDescriptor",
flow_record-3.15/flow/record/adapter/duckdb.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import logging
+
+import duckdb
+
+from flow.record.adapter.sqlite import (
+    Selector,
+    SqliteReader,
+    SqliteWriter,
+    make_selector,
+)
+
+logger = logging.getLogger(__name__)
+
+__usage__ = """
+DuckDB adapter
+---
+Write usage: rdump -w duckdb://[PATH]?batch_size=[BATCH_SIZE]
+Read usage: rdump duckdb://[PATH]?batch_size=[BATCH_SIZE]
+[PATH]: path to DuckDB database file
+
+Optional parameters:
+[BATCH_SIZE]: number of records to read or write in a single transaction (default: 1000)
+"""
+
+
+class DuckdbReader(SqliteReader):
+    """DuckDB reader, subclasses from SQLite reader."""
+
+    logger = logger
+
+    def __init__(self, path: str, *, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
+        self.selector = make_selector(selector)
+        self.descriptors_seen = set()
+        self.con = duckdb.connect(path)
+        self.count = 0
+        self.batch_size = int(batch_size)
+
+
+class DuckdbWriter(SqliteWriter):
+    """DuckDB writer, subclasses from SQLite writer."""
+
+    logger = logger
+
+    def __init__(self, path: str, *, batch_size: str | int = 1000, **kwargs):
+        self.descriptors_seen = set()
+        self.con = None
+        self.con = duckdb.connect(path)
+        self.count = 0
+        self.batch_size = int(batch_size)
+        self.con.begin()
+
+    def tx_cycle(self) -> None:
+        self.con.commit()
+        self.con.begin()
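The adapter reuses the SQLite reader and writer wholesale, swapping in a duckdb connection and an explicit begin()/commit() transaction cycle. A minimal round-trip sketch, assuming the new duckdb extra is installed; the example/event descriptor and the /tmp path are illustrative:

    from flow.record import RecordDescriptor
    from flow.record.adapter.duckdb import DuckdbReader, DuckdbWriter

    MyRecord = RecordDescriptor("example/event", [("string", "message")])

    # Writes are batched into transactions of batch_size records; close() is
    # assumed to commit the open transaction, as the inherited SqliteWriter does.
    writer = DuckdbWriter("/tmp/records.duckdb", batch_size=1000)
    writer.write(MyRecord(message="hello"))
    writer.write(MyRecord(message="world"))
    writer.close()

    # Iteration (and selector support) is inherited from SqliteReader.
    for record in DuckdbReader("/tmp/records.duckdb"):
        print(record.message)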
flow_record-3.15/flow/record/adapter/line.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from functools import lru_cache
+
+from flow.record import Record, RecordDescriptor, open_path_or_stream
+from flow.record.adapter import AbstractWriter
+from flow.record.utils import is_stdout
+
+__usage__ = """
+Line output format adapter (writer only)
+---
+Write usage: rdump -w line://[PATH]?verbose=[VERBOSE]
+[PATH]: path to file. Leave empty or "-" to output to stdout
+
+Optional arguments:
+[VERBOSE]: Also show fieldtype in line output (default: False)
+"""
+
+
+@lru_cache(maxsize=1024)
+def field_types_for_record_descriptor(desc: RecordDescriptor) -> dict[str, str]:
+    """Return dictionary of fieldname -> fieldtype for given RecordDescriptor.
+
+    Args:
+        desc: RecordDescriptor to get fieldtypes for
+    Returns:
+        Dictionary of fieldname -> fieldtype
+    """
+    return {fname: fieldset.typename for fname, fieldset in desc.get_all_fields().items()}
+
+
+class LineWriter(AbstractWriter):
+    """Prints all fields and values of the Record on a separate line."""
+
+    fp = None
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        fields: list[str] | str | None = None,
+        exclude: list[str] | str | None = None,
+        verbose: bool = False,
+        **kwargs,
+    ):
+        self.fp = open_path_or_stream(path, "wb")
+        self.count = 0
+        self.fields = fields
+        self.exclude = exclude
+        self.verbose = verbose
+        if isinstance(self.fields, str):
+            self.fields = self.fields.split(",")
+        if isinstance(self.exclude, str):
+            self.exclude = self.exclude.split(",")
+
+    def write(self, rec: Record) -> None:
+        rdict = rec._asdict(fields=self.fields, exclude=self.exclude)
+        rdict_types = field_types_for_record_descriptor(rec._desc) if self.verbose else None
+
+        self.count += 1
+        self.fp.write(f"--[ RECORD {self.count} ]--\n".encode())
+        if rdict:
+            if rdict_types:
+                # also account for extra characters for fieldtype and whitespace + parenthesis
+                width = max(len(k + rdict_types[k]) for k in rdict) + 3
+            else:
+                width = max(len(k) for k in rdict)
+            fmt = "{{:>{width}}} = {{}}\n".format(width=width)
+            for key, value in rdict.items():
+                if rdict_types:
+                    key = f"{key} ({rdict_types[key]})"
+                self.fp.write(fmt.format(key, value).encode())
+
+    def flush(self) -> None:
+        if self.fp:
+            self.fp.flush()
+
+    def close(self) -> None:
+        if self.fp and not is_stdout(self.fp):
+            self.fp.close()
+        self.fp = None
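A short sketch of the writer's output shape; per the usage string, a path of "-" (or an empty path) selects stdout. The example/conn descriptor is illustrative:

    from flow.record import RecordDescriptor
    from flow.record.adapter.line import LineWriter

    ConnRecord = RecordDescriptor("example/conn", [("string", "src"), ("string", "dst")])

    writer = LineWriter("-", verbose=True)
    writer.write(ConnRecord(src="10.0.0.1", dst="10.0.0.2"))
    writer.close()

    # Approximate output; verbose=True appends the fieldtype to each
    # right-aligned key, and internal fields such as _generated are listed too:
    #
    # --[ RECORD 1 ]--
    #    src (string) = 10.0.0.1
    #    dst (string) = 10.0.0.2
    #    ...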
flow_record-3.15/flow/record/adapter/splunk.py
@@ -0,0 +1,282 @@
+import json
+import logging
+import socket
+import uuid
+from datetime import datetime
+from enum import Enum
+from typing import Optional
+from urllib.parse import urlparse
+
+try:
+    import httpx
+
+    HAS_HTTPX = True
+except ImportError:
+    HAS_HTTPX = False
+
+from flow.record.adapter import AbstractReader, AbstractWriter
+from flow.record.base import Record
+from flow.record.jsonpacker import JsonRecordPacker
+from flow.record.utils import to_base64, to_bytes, to_str
+
+__usage__ = """
+Splunk output adapter (writer only)
+---
+Write usage: rdump -w splunk+[PROTOCOL]://[IP]:[PORT]?tag=[TAG]&token=[TOKEN]&sourcetype=[SOURCETYPE]
+[PROTOCOL]: Protocol to use for forwarding data. Can be tcp, http or https, defaults to tcp if omitted.
+[IP]:[PORT]: ip and port to a splunk instance
+[TAG]: optional value to add as "rdtag" output field when writing
+[TOKEN]: Authentication token for sending data over HTTP(S)
+[SOURCETYPE]: Set sourcetype of data. Defaults to records, but can also be set to JSON.
+[SSL_VERIFY]: Whether to verify the server certificate when sending data over HTTP(S). Defaults to True.
+"""
+
+log = logging.getLogger(__package__)
+
+# Amount of records to bundle into a single request when sending data over HTTP(S).
+RECORD_BUFFER_LIMIT = 20
+
+# https://docs.splunk.com/Documentation/Splunk/7.3.1/Data/Configureindex-timefieldextraction
+RESERVED_SPLUNK_FIELDS = [
+    "_indextime",
+    "_time",
+    "index",
+    "punct",
+    "source",
+    "sourcetype",
+    "tag",
+    "type",
+]
+
+RESERVED_RECORD_FIELDS = ["_classification", "_generated", "_source"]
+
+PREFIX_WITH_RD = set(RESERVED_SPLUNK_FIELDS + RESERVED_RECORD_FIELDS)
+
+
+class Protocol(Enum):
+    HTTP = "http"
+    HTTPS = "https"
+    TCP = "tcp"
+
+
+class SourceType(Enum):
+    JSON = "json"
+    RECORDS = "records"
+
+
+def splunkify_key_value(record: Record, tag: Optional[str] = None) -> str:
+    ret = []
+
+    ret.append(f'rdtype="{record._desc.name}"')
+
+    if tag is None:
+        ret.append("rdtag=None")
+    else:
+        ret.append(f'rdtag="{tag}"')
+
+    for field in record._desc.get_all_fields():
+        # Omit the _version field as the Splunk adapter has no reader support for deserialising records back.
+        if field == "_version":
+            continue
+
+        val = getattr(record, field)
+
+        if field in PREFIX_WITH_RD:
+            field = f"rd_{field}"
+
+        if val is None:
+            ret.append(f"{field}=None")
+        else:
+            val = to_base64(val) if isinstance(val, bytes) else to_str(val)
+            val = val.replace("\\", "\\\\").replace('"', '\\"')
+            ret.append(f'{field}="{val}"')
+
+    return " ".join(ret)
+
+
+def splunkify_json(packer: JsonRecordPacker, record: Record, tag: Optional[str] = None) -> str:
+    ret = {}
+
+    indexer_fields = [
+        ("host", "host"),
+        ("host", "hostname"),
+        ("time", "ts"),
+    ]
+
+    # When converting a record to json text for splunk, we distinguish between the 'event' (containing the data) and a
+    # few other fields that are splunk-specific for indexing. We add those 'indexer_fields' to the return object first.
+    for splunk_name, field_name in indexer_fields:
+        if hasattr(record, field_name):
+            val = getattr(record, field_name)
+            if val:
+                if isinstance(val, datetime):
+                    # Convert datetime objects to epoch timestamp for reserved fields.
+                    ret[splunk_name] = val.timestamp()
+                    continue
+                ret[splunk_name] = to_str(val)
+
+    record_as_dict = packer.pack_obj(record)
+
+    # Omit the _version field as the Splunk adapter has no reader support for deserialising records back.
+    del record_as_dict["_version"]
+
+    # These fields end up in the 'event', but we have a few reserved field names. If those field names are in the
+    # record, we prefix them with 'rd_' (short for record descriptor)
+    for field in PREFIX_WITH_RD:
+        if field not in record_as_dict:
+            continue
+        new_field = f"rd_{field}"
+
+        record_as_dict[new_field] = record_as_dict[field]
+        del record_as_dict[field]
+
+    # Almost done, just have to add the tag and the type (i.e the record descriptor's name) to the event.
+    record_as_dict["rdtag"] = tag
+
+    # Yes.
+    record_as_dict["rdtype"] = record._desc.name
+
+    ret["event"] = record_as_dict
+    return json.dumps(ret, default=packer.pack_obj)
+
+
+class SplunkWriter(AbstractWriter):
+    sock = None
+    session = None
+
+    def __init__(
+        self,
+        uri: str,
+        tag: Optional[str] = None,
+        token: Optional[str] = None,
+        sourcetype: Optional[str] = None,
+        ssl_verify: bool = True,
+        **kwargs,
+    ):
+        # If the writer is initiated without a protocol, we assume we will be writing over tcp
+        if "://" not in uri:
+            uri = f"tcp://{uri}"
+
+        if sourcetype is None:
+            log.warning("No sourcetype provided, assuming 'records' sourcetype")
+            sourcetype = SourceType.RECORDS
+
+        parsed_url = urlparse(uri)
+        url_scheme = parsed_url.scheme.lower()
+
+        self.sourcetype = SourceType(sourcetype)
+        self.protocol = Protocol(url_scheme)
+
+        if self.protocol == Protocol.TCP and self.sourcetype != SourceType.RECORDS:
+            raise ValueError("For sending data to Splunk over TCP, only the 'records' sourcetype is allowed")
+
+        self.host = parsed_url.hostname
+        self.port = parsed_url.port
+        self.tag = tag
+        self.record_buffer = []
+        self._warned = False
+        self.packer = None
+
+        if self.sourcetype == SourceType.JSON:
+            self.packer = JsonRecordPacker(indent=4, pack_descriptors=False)
+
+        if self.protocol == Protocol.TCP:
+            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, socket.SOL_TCP)
+            self.sock.connect((self.host, self.port))
+            self._send = self._send_tcp
+        elif self.protocol in (Protocol.HTTP, Protocol.HTTPS):
+            if not HAS_HTTPX:
+                raise ImportError("The httpx library is required for sending data over HTTP(S)")
+
+            scheme = self.protocol.value
+            self.token = token
+            if not self.token:
+                raise ValueError("An authorization token is required for the HTTP collector")
+            if not self.token.startswith("Splunk "):
+                self.token = f"Splunk {self.token}"
+
+            # Assume verify=True unless specified otherwise.
+            self.verify = str(ssl_verify).lower() not in ("0", "false")
+            if not self.verify:
+                log.warning("Certificate verification is disabled")
+
+            endpoint = "event" if self.sourcetype != SourceType.RECORDS else "raw"
+            port = f":{self.port}" if self.port else ""
+            self.url = f"{scheme}://{self.host}{port}/services/collector/{endpoint}?auto_extract_timestamp=true"
+
+            self.headers = {
+                "Authorization": self.token,
+                # A randomized value so that Splunk can loadbalance between different incoming datastreams
+                "X-Splunk-Request-Channel": str(uuid.uuid4()),
+            }
+
+            self.session = httpx.Client(verify=self.verify, headers=self.headers)
+
+            self._send = self._send_http
+
+    def _cache_records_for_http(self, data: Optional[bytes] = None, flush: bool = False) -> Optional[bytes]:
+        # It's possible to call this function without any data, purely to flush. Hence this check.
+        if data:
+            self.record_buffer.append(data)
+        if len(self.record_buffer) < RECORD_BUFFER_LIMIT and not flush:
+            # Buffer limit not exceeded yet, so we do not return a buffer yet, unless buffer is explicitly flushed.
+            return
+        buf = b"".join(self.record_buffer)
+        if not buf:
+            return
+
+        # We're going to be returning a buffer for the writer to send, so we can clear the internal record buffer.
+        self.record_buffer.clear()
+        return buf
+
+    def _send(self, data: bytes) -> None:
+        raise RuntimeError("This method should be overridden at runtime")
+
+    def _send_http(self, data: Optional[bytes] = None, flush: bool = False) -> None:
+        buf = self._cache_records_for_http(data, flush)
+        if not buf:
+            return
+        response = self.session.post(self.url, data=buf)
+        if response.status_code != 200:
+            raise ConnectionError(f"{response.text} ({response.status_code})")
+
+    def _send_tcp(self, data: bytes) -> None:
+        self.sock.sendall(data)
+
+    def write(self, record: Record) -> None:
+        if not self._warned and "rdtag" in record._desc.fields:
+            self._warned = True
+            log.warning(
+                "Record has 'rdtag' field which conflicts with the Splunk adapter -- "
+                "Splunk output will have duplicate 'rdtag' fields",
+            )
+
+        if self.sourcetype == SourceType.RECORDS:
+            rec = splunkify_key_value(record, self.tag)
+        else:
+            rec = splunkify_json(self.packer, record, self.tag)
+
+        # Trail with a newline for line breaking.
+        data = to_bytes(rec) + b"\n"
+
+        self._send(data)
+
+    def flush(self) -> None:
+        if self.protocol in [Protocol.HTTP, Protocol.HTTPS]:
+            self._send_http(flush=True)
+
+    def close(self) -> None:
+        # For TCP
+        if self.sock:
+            self.sock.close()
+        self.sock = None
+
+        if self.session:
+            self.flush()
+            self.session.close()
+        self.session = None
+
+
+class SplunkReader(AbstractReader):
+    def __init__(self, path, selector=None, **kwargs):
+        raise NotImplementedError()
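Since splunkify_key_value() is shown in full above, its behaviour can be illustrated directly. The test/event descriptor is made up, and the printed line is approximate: internal fields such as _source are also emitted, prefixed with rd_ like any other reserved name:

    from flow.record import RecordDescriptor
    from flow.record.adapter.splunk import splunkify_key_value

    TestRecord = RecordDescriptor("test/event", [("string", "source")])
    record = TestRecord(source="evtx")

    # "source" is a reserved Splunk field, so it is emitted as rd_source;
    # rdtype carries the descriptor name and rdtag the optional tag.
    print(splunkify_key_value(record, tag="forensics"))
    # e.g. rdtype="test/event" rdtag="forensics" ... rd_source="evtx"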
{flow.record-3.14.dev4 → flow_record-3.15}/flow/record/adapter/sqlite.py
@@ -18,7 +18,7 @@ SQLite adapter
 ---
 Write usage: rdump -w sqlite://[PATH]?batch_size=[BATCH_SIZE]
 Read usage: rdump sqlite://[PATH]?batch_size=[BATCH_SIZE]
-[PATH]: path to sqlite database file
+[PATH]: path to SQLite database file

 Optional parameters:
 [BATCH_SIZE]: number of records to read or write in a single transaction (default: 1000)
@@ -28,12 +28,12 @@ Optional parameters:
 FIELD_MAP = {
     "int": "INTEGER",
     "uint32": "INTEGER",
-    "varint": "INTEGER",
+    "varint": "BIGINT",
     "float": "REAL",
     "boolean": "INTEGER",
     "bytes": "BLOB",
-    "filesize": "INTEGER",
-    "datetime": "TIMESTAMP",
+    "filesize": "BIGINT",
+    "datetime": "TIMESTAMPTZ",
 }

@@ -41,12 +41,15 @@ FIELD_MAP = {
 SQLITE_FIELD_MAP = {
     "VARCHAR": "string",
     "INTEGER": "varint",
+    "BIGINT": "varint",
     "BLOB": "bytes",
     "REAL": "float",
     "DOUBLE": "float",
     "BOOLEAN": "boolean",
     "DATETIME": "datetime",
     "TIMESTAMP": "datetime",
+    "TIMESTAMPTZ": "datetime",
+    "TIMESTAMP WITH TIME ZONE": "datetime",
 }

@@ -58,11 +61,11 @@ def create_descriptor_table(con: sqlite3.Connection, descriptor: RecordDescripto
     column_defs = []
     for column_name, fieldset in descriptor.get_all_fields().items():
         column_type = FIELD_MAP.get(fieldset.typename, "TEXT")
-        column_defs.append(f" `{column_name}` {column_type}")
+        column_defs.append(f' "{column_name}" {column_type}')
     sql_columns = ",\n".join(column_defs)

     # Create the descriptor table
-    sql = f"CREATE TABLE IF NOT EXISTS `{table_name}` (\n{sql_columns}\n)"
+    sql = f'CREATE TABLE IF NOT EXISTS "{table_name}" (\n{sql_columns}\n)'
     logger.debug(sql)
     con.execute(sql)

@@ -72,7 +75,7 @@ def update_descriptor_columns(con: sqlite3.Connection, descriptor: RecordDescrip
     table_name = descriptor.name

     # Get existing columns
-    cursor = con.execute(f"PRAGMA table_info(`{table_name}`)")
+    cursor = con.execute(f'PRAGMA table_info("{table_name}")')
     column_names = set(row[1] for row in cursor.fetchall())

     # Add missing columns
@@ -81,23 +84,23 @@
         if column_name in column_names:
             continue
         column_type = FIELD_MAP.get(fieldset.typename, "TEXT")
-        column_defs.append(f" ALTER TABLE `{table_name}` ADD COLUMN `{column_name}` {column_type}")
+        column_defs.append(f' ALTER TABLE "{table_name}" ADD COLUMN "{column_name}" {column_type}')

     # No missing columns
     if not column_defs:
         return None

     # Add the new columns
-    sql = ";\n".join(column_defs)
-    con.executescript(sql)
+    for col_def in column_defs:
+        con.execute(col_def)


 @lru_cache(maxsize=1000)
 def prepare_insert_sql(table_name: str, field_names: tuple[str]) -> str:
     """Return (cached) prepared SQL statement for inserting a record based on table name and field names."""
-    column_names = ", ".join(f"`{name}`" for name in field_names)
+    column_names = ", ".join(f'"{name}"' for name in field_names)
     value_placeholder = ", ".join(["?"] * len(field_names))
-    return f"INSERT INTO `{table_name}` ({column_names}) VALUES ({value_placeholder})"
+    return f'INSERT INTO "{table_name}" ({column_names}) VALUES ({value_placeholder})'


 def db_insert_record(con: sqlite3.Connection, record: Record) -> None:
@@ -123,7 +126,11 @@ def db_insert_record(con: sqlite3.Connection, record: Record) -> None:


 class SqliteReader(AbstractReader):
-    def __init__(self, path: str, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
+    """SQLite reader."""
+
+    logger = logger
+
+    def __init__(self, path: str, *, batch_size: str | int = 1000, selector: Selector | str | None = None, **kwargs):
         self.selector = make_selector(selector)
         self.descriptors_seen = set()
         self.con = sqlite3.connect(path)
@@ -140,7 +147,7 @@ class SqliteReader(AbstractReader):

         # flow.record is quite strict with what is allowed in fieldnames or decriptor name.
         # While SQLite is less strict, we need to sanitize the names to make them compatible.
-        table_name_org = table_name
+        table_name_org = table_name.replace('"', '""')
         table_name = normalize_fieldname(table_name)

         schema = self.con.execute(
@@ -161,8 +168,8 @@ class SqliteReader(AbstractReader):
             fnames.append(fname)

         descriptor_cls = RecordDescriptor(table_name, fields)
-        table_name_org = table_name_org.replace("`", r"\\\`")
-        cursor = self.con.execute(f"SELECT * FROM `{table_name_org}`")
+        table_name_org = table_name_org.replace('"', '""')
+        cursor = self.con.execute(f'SELECT * FROM "{table_name_org}"')
         while True:
             rows = cursor.fetchmany(self.batch_size)
             if not rows:
@@ -186,19 +193,24 @@ class SqliteReader(AbstractReader):
     def __iter__(self) -> Iterator[Record]:
         """Iterate over all tables in the database and yield records."""
         for table_name in self.table_names():
-            logging.debug("Reading table: %s", table_name)
+            self.logger.debug("Reading table: %s", table_name)
             for record in self.read_table(table_name):
                 if not self.selector or self.selector.match(record):
                     yield record


 class SqliteWriter(AbstractWriter):
-    def __init__(self, path: str, batch_size: str | int = 1000, **kwargs):
+    """SQLite writer."""
+
+    logger = logger
+
+    def __init__(self, path: str, *, batch_size: str | int = 1000, **kwargs):
         self.descriptors_seen = set()
         self.con = None
-        self.con = sqlite3.connect(path)
+        self.con = sqlite3.connect(path, isolation_level=None)
         self.count = 0
         self.batch_size = int(batch_size)
+        self.tx_cycle()

     def write(self, record: Record) -> None:
         """Write a record to the database"""
@@ -207,17 +219,23 @@ class SqliteWriter(AbstractWriter):
             self.descriptors_seen.add(desc)
             create_descriptor_table(self.con, desc)
             update_descriptor_columns(self.con, desc)
+            self.flush()

         db_insert_record(self.con, record)
         self.count += 1

         # Commit every batch_size records
         if self.count % self.batch_size == 0:
-            self.con.commit()
+            self.flush()
+
+    def tx_cycle(self) -> None:
+        if self.con.in_transaction:
+            self.con.execute("COMMIT")
+        self.con.execute("BEGIN")

     def flush(self) -> None:
         if self.con:
-            self.con.commit()
+            self.tx_cycle()

     def close(self) -> None:
         if self.con:
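The writer now opens its connection with isolation_level=None and drives transactions by hand: tx_cycle() commits the open transaction (if any) and immediately begins the next one, and flush() is routed through it. A standalone sketch of the same batching pattern in plain sqlite3; the table and sizes are illustrative:

    import sqlite3

    # isolation_level=None disables sqlite3's implicit transaction management,
    # so BEGIN/COMMIT are issued explicitly, mirroring SqliteWriter.tx_cycle().
    con = sqlite3.connect(":memory:", isolation_level=None)
    con.execute('CREATE TABLE "example" ("n" INTEGER)')

    BATCH_SIZE = 1000
    con.execute("BEGIN")
    for n in range(2500):
        con.execute('INSERT INTO "example" ("n") VALUES (?)', (n,))
        if (n + 1) % BATCH_SIZE == 0:
            con.execute("COMMIT")  # flush the completed batch
            con.execute("BEGIN")   # start the next transaction
    if con.in_transaction:
        con.execute("COMMIT")
    con.close()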