flow.record 3.18.dev2__tar.gz → 3.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. flow_record-3.19/.git-blame-ignore-revs +6 -0
  2. {flow_record-3.18.dev2/flow.record.egg-info → flow_record-3.19}/PKG-INFO +2 -2
  3. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/__init__.py +27 -35
  4. flow_record-3.19/flow/record/adapter/__init__.py +53 -0
  5. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/archive.py +12 -5
  6. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/avro.py +18 -15
  7. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/broker.py +16 -8
  8. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/csvfile.py +26 -12
  9. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/elastic.py +60 -16
  10. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/jsonfile.py +20 -9
  11. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/line.py +3 -6
  12. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/mongo.py +17 -8
  13. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/split.py +12 -5
  14. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/splunk.py +39 -41
  15. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/sqlite.py +7 -4
  16. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/stream.py +10 -4
  17. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/text.py +25 -9
  18. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/xlsx.py +17 -9
  19. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/base.py +107 -123
  20. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/__init__.py +156 -160
  21. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/credential.py +2 -0
  22. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/net/__init__.py +12 -4
  23. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/net/ip.py +63 -10
  24. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/net/ipv4.py +35 -34
  25. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/net/tcp.py +2 -0
  26. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/fieldtypes/net/udp.py +2 -0
  27. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/jsonpacker.py +20 -20
  28. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/packer.py +26 -22
  29. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/selector.py +105 -119
  30. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/stream.py +66 -53
  31. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/tools/geoip.py +18 -15
  32. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/tools/rdump.py +10 -8
  33. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/utils.py +11 -10
  34. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/version.py +2 -2
  35. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/whitelist.py +3 -0
  36. {flow_record-3.18.dev2 → flow_record-3.19/flow.record.egg-info}/PKG-INFO +2 -2
  37. {flow_record-3.18.dev2 → flow_record-3.19}/flow.record.egg-info/SOURCES.txt +1 -0
  38. {flow_record-3.18.dev2 → flow_record-3.19}/pyproject.toml +48 -5
  39. {flow_record-3.18.dev2 → flow_record-3.19}/tests/_utils.py +10 -2
  40. {flow_record-3.18.dev2 → flow_record-3.19}/tests/docs/Makefile +1 -1
  41. {flow_record-3.18.dev2 → flow_record-3.19}/tests/docs/conf.py +5 -0
  42. {flow_record-3.18.dev2 → flow_record-3.19}/tests/selector_explain_example.py +4 -2
  43. {flow_record-3.18.dev2 → flow_record-3.19}/tests/standalone_test.py +6 -4
  44. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_adapter_line.py +3 -1
  45. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_adapter_text.py +3 -1
  46. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_avro.py +13 -7
  47. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_avro_adapter.py +13 -6
  48. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_compiled_selector.py +5 -3
  49. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_csv_adapter.py +6 -1
  50. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_deprecations.py +6 -4
  51. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_elastic_adapter.py +7 -1
  52. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_fieldtype_ip.py +97 -17
  53. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_fieldtypes.py +61 -63
  54. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_json_packer.py +5 -3
  55. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_json_record_adapter.py +24 -18
  56. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_multi_timestamp.py +13 -11
  57. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_packer.py +14 -12
  58. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_rdump.py +57 -39
  59. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_record.py +47 -42
  60. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_record_adapter.py +81 -81
  61. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_record_descriptor.py +10 -8
  62. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_regression.py +50 -45
  63. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_selector.py +53 -54
  64. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_splunk_adapter.py +22 -17
  65. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_sqlite_duckdb_adapter.py +21 -13
  66. {flow_record-3.18.dev2 → flow_record-3.19}/tests/test_xlsx_adapter.py +8 -3
  67. {flow_record-3.18.dev2 → flow_record-3.19}/tox.ini +5 -19
  68. flow_record-3.18.dev2/flow/record/adapter/__init__.py +0 -63
  69. {flow_record-3.18.dev2 → flow_record-3.19}/COPYRIGHT +0 -0
  70. {flow_record-3.18.dev2 → flow_record-3.19}/LICENSE +0 -0
  71. {flow_record-3.18.dev2 → flow_record-3.19}/MANIFEST.in +0 -0
  72. {flow_record-3.18.dev2 → flow_record-3.19}/README.md +0 -0
  73. {flow_record-3.18.dev2 → flow_record-3.19}/examples/filesystem.py +0 -0
  74. {flow_record-3.18.dev2 → flow_record-3.19}/examples/passivedns.py +0 -0
  75. {flow_record-3.18.dev2 → flow_record-3.19}/examples/records.json +0 -0
  76. {flow_record-3.18.dev2 → flow_record-3.19}/examples/tcpconn.py +0 -0
  77. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/adapter/duckdb.py +0 -0
  78. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/exceptions.py +0 -0
  79. {flow_record-3.18.dev2 → flow_record-3.19}/flow/record/tools/__init__.py +0 -0
  80. {flow_record-3.18.dev2 → flow_record-3.19}/flow.record.egg-info/dependency_links.txt +0 -0
  81. {flow_record-3.18.dev2 → flow_record-3.19}/flow.record.egg-info/entry_points.txt +0 -0
  82. {flow_record-3.18.dev2 → flow_record-3.19}/flow.record.egg-info/requires.txt +0 -0
  83. {flow_record-3.18.dev2 → flow_record-3.19}/flow.record.egg-info/top_level.txt +0 -0
  84. {flow_record-3.18.dev2 → flow_record-3.19}/setup.cfg +0 -0
  85. {flow_record-3.18.dev2 → flow_record-3.19}/tests/__init__.py +0 -0
  86. {flow_record-3.18.dev2 → flow_record-3.19}/tests/docs/index.rst +0 -0
@@ -0,0 +1,6 @@
1
+ # Formatting commits. You can ignore them during git-blame with `--ignore-rev` or `--ignore-revs-file`.
2
+ #
3
+ # $ git config --add 'blame.ignoreRevsFile' '.git-blame-ignore-revs'
4
+ #
5
+ # Change linter to Ruff (#158)
6
+ c67f778c653c295ec26146cf6422d3b06ac640e8
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: flow.record
3
- Version: 3.18.dev2
3
+ Version: 3.19
4
4
  Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
5
5
  Author-email: Dissect Team <dissect@fox-it.com>
6
6
  License: Affero General Public License v3
@@ -1,5 +1,7 @@
1
+ from __future__ import annotations
2
+
1
3
  import gzip
2
- import os
4
+ from pathlib import Path
3
5
 
4
6
  from flow.record.base import (
5
7
  IGNORE_FIELDS_FOR_COMPARISON,
@@ -39,71 +41,61 @@ from flow.record.stream import (
39
41
 
40
42
  __all__ = [
41
43
  "IGNORE_FIELDS_FOR_COMPARISON",
42
- "RECORD_VERSION",
43
44
  "RECORDSTREAM_MAGIC",
45
+ "RECORD_VERSION",
46
+ "DynamicDescriptor",
44
47
  "FieldType",
45
- "Record",
46
48
  "GroupedRecord",
47
- "RecordDescriptor",
49
+ "JsonRecordPacker",
50
+ "PathTemplateWriter",
51
+ "Record",
48
52
  "RecordAdapter",
53
+ "RecordArchiver",
54
+ "RecordDescriptor",
55
+ "RecordDescriptorError",
49
56
  "RecordField",
50
- "RecordReader",
51
- "RecordWriter",
52
57
  "RecordOutput",
53
- "RecordPrinter",
54
58
  "RecordPacker",
55
- "JsonRecordPacker",
56
- "RecordStreamWriter",
59
+ "RecordPrinter",
60
+ "RecordReader",
57
61
  "RecordStreamReader",
58
- "open_path_or_stream",
62
+ "RecordStreamWriter",
63
+ "RecordWriter",
64
+ "dynamic_fieldtype",
65
+ "extend_record",
66
+ "ignore_fields_for_comparison",
67
+ "iter_timestamped_records",
59
68
  "open_path",
69
+ "open_path_or_stream",
60
70
  "open_stream",
61
- "ignore_fields_for_comparison",
71
+ "record_stream",
62
72
  "set_ignored_fields_for_comparison",
63
73
  "stream",
64
- "dynamic_fieldtype",
65
- "DynamicDescriptor",
66
- "PathTemplateWriter",
67
- "RecordArchiver",
68
- "RecordDescriptorError",
69
- "record_stream",
70
- "extend_record",
71
- "iter_timestamped_records",
72
74
  ]
73
75
 
74
76
 
75
- class View:
76
- fields = None
77
-
78
- def __init__(self, fields):
79
- self.fields = fields
80
-
81
- def __iter__(self, fields):
82
- pass
83
-
84
-
85
77
  class RecordDateSplitter:
86
78
  basepath = None
87
79
  out = None
88
80
 
89
- def __init__(self, basepath):
90
- self.basepath = basepath
81
+ def __init__(self, basepath: str | Path):
82
+ self.basepath = Path(basepath)
91
83
  self.out = {}
92
84
 
93
- def getstream(self, t):
85
+ def getstream(self, t: tuple[int, int, int]) -> RecordStreamWriter:
94
86
  if t not in self.out:
95
- path = os.path.join(self.basepath, "-".join(["{:2d}".format(v) for v in t]) + ".rec.gz")
87
+ path = self.basepath.joinpath("-".join([f"{v:2d}" for v in t]) + ".rec.gz")
96
88
  f = gzip.GzipFile(path, "wb")
97
89
  rs = RecordStreamWriter(f)
98
90
  self.out[t] = rs
99
91
  return self.out[t]
100
92
 
101
- def write(self, r):
93
+ def write(self, r: Record) -> None:
102
94
  t = (r.ts.year, r.ts.month, r.ts.day)
103
95
  rs = self.getstream(t)
104
96
  rs.write(r)
105
97
  rs.fp.flush()
106
98
 
107
- def close(self):
99
+ def close(self) -> None:
108
100
  for rs in self.out.values():
109
101
  rs.close()
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # make this namespace extensible from other packages
4
+ import abc
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Iterator
9
+
10
+ from flow.record.base import Record
11
+
12
+
13
+ class AbstractWriter(metaclass=abc.ABCMeta):
14
+ @abc.abstractmethod
15
+ def write(self, rec: Record) -> None:
16
+ """Write a record."""
17
+ raise NotImplementedError
18
+
19
+ @abc.abstractmethod
20
+ def flush(self) -> None:
21
+ """Flush any buffered writes."""
22
+ raise NotImplementedError
23
+
24
+ @abc.abstractmethod
25
+ def close(self) -> None:
26
+ """Close the Writer, no more writes will be possible."""
27
+ raise NotImplementedError
28
+
29
+ def __del__(self) -> None:
30
+ self.close()
31
+
32
+ def __enter__(self) -> AbstractWriter: # noqa: PYI034
33
+ return self
34
+
35
+ def __exit__(self, *args) -> None:
36
+ self.flush()
37
+ self.close()
38
+
39
+
40
+ class AbstractReader(metaclass=abc.ABCMeta):
41
+ @abc.abstractmethod
42
+ def __iter__(self) -> Iterator[Record]:
43
+ """Return a record iterator."""
44
+ raise NotImplementedError
45
+
46
+ def close(self) -> None: # noqa: B027
47
+ """Close the Reader, can be overriden to properly free resources."""
48
+
49
+ def __enter__(self) -> AbstractReader: # noqa: PYI034
50
+ return self
51
+
52
+ def __exit__(self, *args) -> None:
53
+ self.close()
@@ -1,6 +1,13 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
1
5
  from flow.record.adapter import AbstractReader, AbstractWriter
2
6
  from flow.record.stream import RecordArchiver
3
7
 
8
+ if TYPE_CHECKING:
9
+ from flow.record.base import Record
10
+
4
11
  __usage__ = """
5
12
  Record archiver adapter, writes records to YYYY/mm/dd directories (writer only)
6
13
  ---
@@ -12,7 +19,7 @@ Write usage: rdump -w archive://[PATH]
12
19
  class ArchiveWriter(AbstractWriter):
13
20
  writer = None
14
21
 
15
- def __init__(self, path, **kwargs):
22
+ def __init__(self, path: str, **kwargs):
16
23
  self.path = path
17
24
 
18
25
  path_template = kwargs.get("path_template")
@@ -20,19 +27,19 @@ class ArchiveWriter(AbstractWriter):
20
27
 
21
28
  self.writer = RecordArchiver(self.path, path_template=path_template, name=name)
22
29
 
23
- def write(self, r):
30
+ def write(self, r: Record) -> None:
24
31
  self.writer.write(r)
25
32
 
26
- def flush(self):
33
+ def flush(self) -> None:
27
34
  # RecordArchiver already flushes after every write
28
35
  pass
29
36
 
30
- def close(self):
37
+ def close(self) -> None:
31
38
  if self.writer:
32
39
  self.writer.close()
33
40
  self.writer = None
34
41
 
35
42
 
36
43
  class ArchiveReader(AbstractReader):
37
- def __init__(self, path, **kwargs):
44
+ def __init__(self, path: str, **kwargs):
38
45
  raise NotImplementedError
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import json
4
4
  from datetime import datetime, timedelta, timezone
5
5
  from importlib.util import find_spec
6
- from typing import Any, Iterator
6
+ from typing import TYPE_CHECKING, Any, BinaryIO
7
7
 
8
8
  import fastavro
9
9
 
@@ -12,6 +12,10 @@ from flow.record.adapter import AbstractReader, AbstractWriter
12
12
  from flow.record.selector import make_selector
13
13
  from flow.record.utils import is_stdout
14
14
 
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Iterator
17
+ from pathlib import Path
18
+
15
19
  __usage__ = """
16
20
  Apache AVRO adapter
17
21
  ---
@@ -52,7 +56,7 @@ class AvroWriter(AbstractWriter):
52
56
  fp = None
53
57
  writer = None
54
58
 
55
- def __init__(self, path, key=None, **kwargs):
59
+ def __init__(self, path: str | Path | BinaryIO, **kwargs):
56
60
  self.fp = record.open_path_or_stream(path, "wb")
57
61
 
58
62
  self.desc = None
@@ -69,11 +73,11 @@ class AvroWriter(AbstractWriter):
69
73
  self.writer = fastavro.write.Writer(self.fp, self.parsed_schema, codec=self.codec)
70
74
 
71
75
  if self.desc != r._desc:
72
- raise Exception("Mixed record types")
76
+ raise ValueError("Mixed record types")
73
77
 
74
78
  self.writer.write(r._packdict())
75
79
 
76
- def flush(self):
80
+ def flush(self) -> None:
77
81
  if not self.writer:
78
82
  self.writer = fastavro.write.Writer(
79
83
  self.fp,
@@ -92,21 +96,21 @@ class AvroWriter(AbstractWriter):
92
96
  class AvroReader(AbstractReader):
93
97
  fp = None
94
98
 
95
- def __init__(self, path, selector=None, **kwargs):
99
+ def __init__(self, path: str, selector: str | None = None, **kwargs):
96
100
  self.fp = record.open_path_or_stream(path, "rb")
97
101
  self.selector = make_selector(selector)
98
102
 
99
103
  self.reader = fastavro.reader(self.fp)
100
104
  self.schema = self.reader.writer_schema
101
105
  if not self.schema:
102
- raise Exception("Missing Avro schema")
106
+ raise ValueError("Missing Avro schema")
103
107
 
104
108
  self.desc = schema_to_descriptor(self.schema)
105
109
 
106
110
  # Store the fieldnames that are of type "datetime"
107
- self.datetime_fields = set(
111
+ self.datetime_fields = {
108
112
  name for name, field in self.desc.get_all_fields().items() if field.typename == "datetime"
109
- )
113
+ }
110
114
 
111
115
  def __iter__(self) -> Iterator[record.Record]:
112
116
  for obj in self.reader:
@@ -149,7 +153,7 @@ def descriptor_to_schema(desc: record.RecordDescriptor) -> dict[str, Any]:
149
153
  else:
150
154
  avro_type = AVRO_TYPE_MAP.get(field_type)
151
155
  if not avro_type:
152
- raise Exception("Unsupported Avro type: {}".format(field_type))
156
+ raise ValueError(f"Unsupported Avro type: {field_type}")
153
157
 
154
158
  field_schema["type"] = [avro_type, "null"]
155
159
 
@@ -190,11 +194,10 @@ def avro_type_to_flow_type(ftype: list) -> str:
190
194
  if isinstance(t, dict):
191
195
  if t.get("type") == "array":
192
196
  item_type = avro_type_to_flow_type(t.get("items"))
193
- return "{}[]".format(item_type)
194
- else:
195
- logical_type = t.get("logicalType")
196
- if logical_type and ("time" in logical_type or "date" in logical_type):
197
- return "datetime"
197
+ return f"{item_type}[]"
198
+ logical_type = t.get("logicalType")
199
+ if logical_type and ("time" in logical_type or "date" in logical_type):
200
+ return "datetime"
198
201
 
199
202
  if t == "null":
200
203
  continue
@@ -202,4 +205,4 @@ def avro_type_to_flow_type(ftype: list) -> str:
202
205
  if t in RECORD_TYPE_MAP:
203
206
  return RECORD_TYPE_MAP[t]
204
207
 
205
- raise TypeError("Can't map avro type to flow type: {}".format(t))
208
+ raise TypeError(f"Can't map avro type to flow type: {t}")
@@ -1,7 +1,15 @@
1
- from flow.broker import Publisher, Subscriber
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
2
4
 
5
+ from flow.broker import Publisher, Subscriber
3
6
  from flow.record.adapter import AbstractReader, AbstractWriter
4
7
 
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Iterator
10
+
11
+ from flow.record.base import Record
12
+
5
13
  __usage__ = """
6
14
  PubSub adapter using flow.broker
7
15
  ---
@@ -13,23 +21,23 @@ Read usage: rdump broker+tcp://[IP]:[PORT] -s True
13
21
  class BrokerWriter(AbstractWriter):
14
22
  publisher = None
15
23
 
16
- def __init__(self, uri, source=None, classification=None, **kwargs):
24
+ def __init__(self, uri: str, source: str | None = None, classification: str | None = None, **kwargs):
17
25
  self.publisher = Publisher(uri, **kwargs)
18
26
  self.source = source
19
27
  self.classification = classification
20
28
 
21
- def write(self, r):
29
+ def write(self, r: Record) -> None:
22
30
  record = r._replace(
23
31
  _source=self.source or r._source,
24
32
  _classification=self.classification or r._classification,
25
33
  )
26
34
  self.publisher.send(record)
27
35
 
28
- def flush(self):
36
+ def flush(self) -> None:
29
37
  if self.publisher:
30
38
  self.publisher.flush()
31
39
 
32
- def close(self):
40
+ def close(self) -> None:
33
41
  if self.publisher:
34
42
  if hasattr(self.publisher, "stop"):
35
43
  # Requires flow.broker >= 1.1.1
@@ -42,14 +50,14 @@ class BrokerWriter(AbstractWriter):
42
50
  class BrokerReader(AbstractReader):
43
51
  subscriber = None
44
52
 
45
- def __init__(self, uri, name=None, selector=None, **kwargs):
53
+ def __init__(self, uri: str, name: str | None = None, selector: str | None = None, **kwargs):
46
54
  self.subscriber = Subscriber(uri, **kwargs)
47
55
  self.subscription = self.subscriber.select(name, str(selector))
48
56
 
49
- def __iter__(self):
57
+ def __iter__(self) -> Iterator[Record]:
50
58
  return iter(self.subscription)
51
59
 
52
- def close(self):
60
+ def close(self) -> None:
53
61
  if self.subscriber:
54
62
  self.subscriber.stop()
55
63
  self.subscriber = None
@@ -1,14 +1,19 @@
1
- from __future__ import absolute_import
1
+ from __future__ import annotations
2
2
 
3
3
  import csv
4
4
  import sys
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING
5
7
 
6
8
  from flow.record import RecordDescriptor
7
9
  from flow.record.adapter import AbstractReader, AbstractWriter
8
- from flow.record.base import normalize_fieldname
10
+ from flow.record.base import Record, normalize_fieldname
9
11
  from flow.record.selector import make_selector
10
12
  from flow.record.utils import is_stdout
11
13
 
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Iterator
16
+
12
17
  __usage__ = """
13
18
  Comma-separated values (CSV) adapter
14
19
  ---
@@ -23,13 +28,20 @@ Optional parameters:
23
28
 
24
29
 
25
30
  class CsvfileWriter(AbstractWriter):
26
- def __init__(self, path, fields=None, exclude=None, lineterminator=None, **kwargs):
31
+ def __init__(
32
+ self,
33
+ path: str | Path | None,
34
+ fields: str | list[str] | None = None,
35
+ exclude: str | list[str] | None = None,
36
+ lineterminator: str = "\r\n",
37
+ **kwargs,
38
+ ):
27
39
  self.fp = None
28
40
  if path in (None, "", "-"):
29
41
  self.fp = sys.stdout
30
42
  else:
31
- self.fp = open(path, "w", newline="")
32
- self.lineterminator = lineterminator or "\r\n"
43
+ self.fp = Path(path).open("w", newline="") # noqa: SIM115
44
+ self.lineterminator = lineterminator
33
45
  for r, n in ((r"\r", "\r"), (r"\n", "\n"), (r"\t", "\t")):
34
46
  self.lineterminator = self.lineterminator.replace(r, n)
35
47
  self.desc = None
@@ -41,7 +53,7 @@ class CsvfileWriter(AbstractWriter):
41
53
  if isinstance(self.exclude, str):
42
54
  self.exclude = self.exclude.split(",")
43
55
 
44
- def write(self, r):
56
+ def write(self, r: Record) -> None:
45
57
  rdict = r._asdict(fields=self.fields, exclude=self.exclude)
46
58
  if not self.desc or self.desc != r._desc:
47
59
  self.desc = r._desc
@@ -49,24 +61,26 @@ class CsvfileWriter(AbstractWriter):
49
61
  self.writer.writeheader()
50
62
  self.writer.writerow(rdict)
51
63
 
52
- def flush(self):
64
+ def flush(self) -> None:
53
65
  if self.fp:
54
66
  self.fp.flush()
55
67
 
56
- def close(self):
68
+ def close(self) -> None:
57
69
  if self.fp and not is_stdout(self.fp):
58
70
  self.fp.close()
59
71
  self.fp = None
60
72
 
61
73
 
62
74
  class CsvfileReader(AbstractReader):
63
- def __init__(self, path, selector=None, fields=None, **kwargs):
75
+ def __init__(
76
+ self, path: str | Path | None, selector: str | None = None, fields: str | list[str] | None = None, **kwargs
77
+ ):
64
78
  self.fp = None
65
79
  self.selector = make_selector(selector)
66
80
  if path in (None, "", "-"):
67
81
  self.fp = sys.stdin
68
82
  else:
69
- self.fp = open(path, "r", newline="")
83
+ self.fp = Path(path).open("r", newline="") # noqa: SIM115
70
84
 
71
85
  self.dialect = "excel"
72
86
  if self.fp.seekable():
@@ -87,12 +101,12 @@ class CsvfileReader(AbstractReader):
87
101
  # Create RecordDescriptor from fields, skipping fields starting with "_" (reserved for internal use)
88
102
  self.desc = RecordDescriptor("csv/reader", [("string", col) for col in self.fields if not col.startswith("_")])
89
103
 
90
- def close(self):
104
+ def close(self) -> None:
91
105
  if self.fp:
92
106
  self.fp.close()
93
107
  self.fp = None
94
108
 
95
- def __iter__(self):
109
+ def __iter__(self) -> Iterator[Record]:
96
110
  for row in self.reader:
97
111
  rdict = dict(zip(self.fields, row))
98
112
  record = self.desc.init_from_dict(rdict)
@@ -4,16 +4,26 @@ import hashlib
4
4
  import logging
5
5
  import queue
6
6
  import threading
7
- from typing import Iterator
7
+ from typing import TYPE_CHECKING
8
8
 
9
- import elasticsearch
10
- import elasticsearch.helpers
9
+ try:
10
+ import elasticsearch
11
+ import elasticsearch.helpers
12
+
13
+ HAS_ELASTIC = True
14
+
15
+ except ImportError:
16
+ HAS_ELASTIC = False
11
17
 
12
18
  from flow.record.adapter import AbstractReader, AbstractWriter
13
19
  from flow.record.base import Record, RecordDescriptor
14
20
  from flow.record.fieldtypes import fieldtype_for_value
15
21
  from flow.record.jsonpacker import JsonRecordPacker
16
- from flow.record.selector import CompiledSelector, Selector
22
+
23
+ if TYPE_CHECKING:
24
+ from collections.abc import Iterator
25
+
26
+ from flow.record.selector import CompiledSelector, Selector
17
27
 
18
28
  __usage__ = """
19
29
  ElasticSearch adapter
@@ -25,9 +35,12 @@ Read usage: rdump elastic+[PROTOCOL]://[IP]:[PORT]?index=[INDEX]
25
35
 
26
36
  Optional arguments:
27
37
  [API_KEY]: base64 encoded api key to authenticate with (default: False)
38
+ [QUEUE_SIZE]: maximum queue size for writing records; limits memory usage (default: 100000)
28
39
  [INDEX]: name of the index to use (default: records)
29
40
  [VERIFY_CERTS]: verify certs of Elasticsearch instance (default: True)
30
41
  [HASH_RECORD]: make record unique by hashing record [slow] (default: False)
42
+ [REQUEST_TIMEOUT]: maximum duration in seconds for a request to Elastic (default: 30)
43
+ [MAX_RETRIES]: maximum retries before a record is marked as failed (default: 3)
31
44
  [_META_*]: record metadata fields (default: None)
32
45
  """
33
46
 
@@ -43,33 +56,49 @@ class ElasticWriter(AbstractWriter):
43
56
  http_compress: str | bool = True,
44
57
  hash_record: str | bool = False,
45
58
  api_key: str | None = None,
59
+ queue_size: int = 100000,
60
+ request_timeout: int = 30,
61
+ max_retries: int = 3,
46
62
  **kwargs,
47
63
  ) -> None:
64
+ """Initialize the ElasticWriter.
65
+
66
+ Resources:
67
+ - https://elasticsearch-py.readthedocs.io/en/v8.17.1/api/elasticsearch.html
68
+ """
69
+
70
+ if not HAS_ELASTIC:
71
+ raise RuntimeError("Required dependency 'elasticsearch' missing")
72
+
48
73
  self.index = index
49
74
  self.uri = uri
50
75
  verify_certs = str(verify_certs).lower() in ("1", "true")
51
76
  http_compress = str(http_compress).lower() in ("1", "true")
52
77
  self.hash_record = str(hash_record).lower() in ("1", "true")
78
+ queue_size = int(queue_size)
53
79
 
54
80
  if not uri.lower().startswith(("http://", "https://")):
55
81
  uri = "http://" + uri
56
82
 
57
- self.queue: queue.Queue[Record | StopIteration] = queue.Queue()
83
+ self.queue: queue.Queue[Record | StopIteration] = queue.Queue(maxsize=queue_size)
58
84
  self.event = threading.Event()
85
+ self.exception: Exception | None = None
86
+ threading.excepthook = self.excepthook
59
87
 
60
88
  self.es = elasticsearch.Elasticsearch(
61
89
  uri,
62
90
  verify_certs=verify_certs,
63
91
  http_compress=http_compress,
64
92
  api_key=api_key,
93
+ request_timeout=request_timeout,
94
+ retry_on_timeout=True,
95
+ max_retries=max_retries,
65
96
  )
66
97
 
67
98
  self.json_packer = JsonRecordPacker()
68
99
 
69
100
  self.thread = threading.Thread(target=self.streaming_bulk_thread)
70
101
  self.thread.start()
71
- self.exception: Exception | None = None
72
- threading.excepthook = self.excepthook
73
102
 
74
103
  if not verify_certs:
75
104
  # Disable InsecureRequestWarning of urllib3, caused by the verify_certs flag.
@@ -83,8 +112,8 @@ class ElasticWriter(AbstractWriter):
83
112
  self.metadata_fields[arg_key[6:]] = arg_val
84
113
 
85
114
  def excepthook(self, exc: threading.ExceptHookArgs, *args, **kwargs) -> None:
86
- log.error("Exception in thread: %s", exc.exc_value.message)
87
- self.exception = exc.exc_value
115
+ log.error("Exception in thread: %s", exc)
116
+ self.exception = getattr(exc, "exc_value", exc)
88
117
  self.event.set()
89
118
  self.close()
90
119
 
@@ -128,24 +157,34 @@ class ElasticWriter(AbstractWriter):
128
157
  record = self.queue.get()
129
158
  if record is StopIteration:
130
159
  break
160
+ if not record:
161
+ continue
131
162
  yield self.record_to_document(record, index=self.index)
132
163
 
133
164
  def streaming_bulk_thread(self) -> None:
134
- """Thread that streams the documents to ES via the bulk api"""
165
+ """Thread that streams the documents to ES via the bulk api.
135
166
 
136
- for ok, item in elasticsearch.helpers.streaming_bulk(
167
+ Resources:
168
+ - https://elasticsearch-py.readthedocs.io/en/v8.17.1/helpers.html#elasticsearch.helpers.streaming_bulk
169
+ - https://github.com/elastic/elasticsearch-py/blob/main/elasticsearch/helpers/actions.py#L362
170
+ """
171
+ for _ok, _item in elasticsearch.helpers.streaming_bulk(
137
172
  self.es,
138
173
  self.document_stream(),
139
- raise_on_error=False,
140
- raise_on_exception=False,
174
+ raise_on_error=True,
175
+ raise_on_exception=True,
176
+ # Some settings have to be redefined because streaming_bulk does not inherit them from the self.es instance.
177
+ max_retries=3,
141
178
  ):
142
- if not ok:
143
- log.error("Failed to insert %r", item)
179
+ pass
144
180
 
145
181
  self.event.set()
146
182
 
147
183
  def write(self, record: Record) -> None:
148
- self.queue.put_nowait(record)
184
+ if self.exception:
185
+ raise self.exception
186
+
187
+ self.queue.put(record)
149
188
 
150
189
  def flush(self) -> None:
151
190
  pass
@@ -170,6 +209,8 @@ class ElasticReader(AbstractReader):
170
209
  http_compress: str | bool = True,
171
210
  selector: None | Selector | CompiledSelector = None,
172
211
  api_key: str | None = None,
212
+ request_timeout: int = 30,
213
+ max_retries: int = 3,
173
214
  **kwargs,
174
215
  ) -> None:
175
216
  self.index = index
@@ -186,6 +227,9 @@ class ElasticReader(AbstractReader):
186
227
  verify_certs=verify_certs,
187
228
  http_compress=http_compress,
188
229
  api_key=api_key,
230
+ request_timeout=request_timeout,
231
+ retry_on_timeout=True,
232
+ max_retries=max_retries,
189
233
  )
190
234
 
191
235
  if not verify_certs: