flow.record 3.19.dev8__tar.gz → 3.20.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/PKG-INFO +5 -1
  2. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/elastic.py +45 -10
  3. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/text.py +2 -2
  4. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/tools/rdump.py +23 -1
  5. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/version.py +2 -2
  6. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/PKG-INFO +5 -1
  7. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/requires.txt +5 -0
  8. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/pyproject.toml +5 -0
  9. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_rdump.py +26 -0
  10. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/.git-blame-ignore-revs +0 -0
  11. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/COPYRIGHT +0 -0
  12. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/LICENSE +0 -0
  13. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/MANIFEST.in +0 -0
  14. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/README.md +0 -0
  15. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/examples/filesystem.py +0 -0
  16. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/examples/passivedns.py +0 -0
  17. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/examples/records.json +0 -0
  18. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/examples/tcpconn.py +0 -0
  19. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/__init__.py +0 -0
  20. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/__init__.py +0 -0
  21. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/archive.py +0 -0
  22. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/avro.py +0 -0
  23. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/broker.py +0 -0
  24. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/csvfile.py +0 -0
  25. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/duckdb.py +0 -0
  26. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/jsonfile.py +0 -0
  27. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/line.py +0 -0
  28. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/mongo.py +0 -0
  29. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/split.py +0 -0
  30. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/splunk.py +0 -0
  31. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/sqlite.py +0 -0
  32. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/stream.py +0 -0
  33. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/xlsx.py +0 -0
  34. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/base.py +0 -0
  35. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/exceptions.py +0 -0
  36. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/__init__.py +0 -0
  37. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/credential.py +0 -0
  38. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/net/__init__.py +0 -0
  39. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/net/ip.py +0 -0
  40. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/net/ipv4.py +0 -0
  41. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/net/tcp.py +0 -0
  42. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/fieldtypes/net/udp.py +0 -0
  43. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/jsonpacker.py +0 -0
  44. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/packer.py +0 -0
  45. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/selector.py +0 -0
  46. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/stream.py +0 -0
  47. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/tools/__init__.py +0 -0
  48. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/tools/geoip.py +0 -0
  49. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/utils.py +0 -0
  50. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/whitelist.py +0 -0
  51. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/SOURCES.txt +0 -0
  52. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/dependency_links.txt +0 -0
  53. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/entry_points.txt +0 -0
  54. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/top_level.txt +0 -0
  55. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/setup.cfg +0 -0
  56. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/__init__.py +0 -0
  57. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/_utils.py +0 -0
  58. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/docs/Makefile +0 -0
  59. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/docs/conf.py +0 -0
  60. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/docs/index.rst +0 -0
  61. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/selector_explain_example.py +0 -0
  62. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/standalone_test.py +0 -0
  63. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_adapter_line.py +0 -0
  64. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_adapter_text.py +0 -0
  65. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_avro.py +0 -0
  66. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_avro_adapter.py +0 -0
  67. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_compiled_selector.py +0 -0
  68. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_csv_adapter.py +0 -0
  69. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_deprecations.py +0 -0
  70. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_elastic_adapter.py +0 -0
  71. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_fieldtype_ip.py +0 -0
  72. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_fieldtypes.py +0 -0
  73. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_json_packer.py +0 -0
  74. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_json_record_adapter.py +0 -0
  75. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_multi_timestamp.py +0 -0
  76. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_packer.py +0 -0
  77. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_record.py +0 -0
  78. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_record_adapter.py +0 -0
  79. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_record_descriptor.py +0 -0
  80. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_regression.py +0 -0
  81. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_selector.py +0 -0
  82. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_splunk_adapter.py +0 -0
  83. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_sqlite_duckdb_adapter.py +0 -0
  84. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_xlsx_adapter.py +0 -0
  85. {flow_record-3.19.dev8 → flow_record-3.20.dev1}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: flow.record
3
- Version: 3.19.dev8
3
+ Version: 3.20.dev1
4
4
  Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
5
5
  Author-email: Dissect Team <dissect@fox-it.com>
6
6
  License: Affero General Public License v3
@@ -43,6 +43,10 @@ Requires-Dist: flow.record[avro]; extra == "test"
43
43
  Requires-Dist: flow.record[elastic]; extra == "test"
44
44
  Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
45
45
  Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
46
+ Requires-Dist: tqdm; extra == "test"
47
+ Provides-Extra: full
48
+ Requires-Dist: flow.record[compression]; extra == "full"
49
+ Requires-Dist: tqdm; extra == "full"
46
50
 
47
51
  # flow.record
48
52
 
@@ -6,8 +6,14 @@ import queue
6
6
  import threading
7
7
  from typing import TYPE_CHECKING
8
8
 
9
- import elasticsearch
10
- import elasticsearch.helpers
9
+ try:
10
+ import elasticsearch
11
+ import elasticsearch.helpers
12
+
13
+ HAS_ELASTIC = True
14
+
15
+ except ImportError:
16
+ HAS_ELASTIC = False
11
17
 
12
18
  from flow.record.adapter import AbstractReader, AbstractWriter
13
19
  from flow.record.base import Record, RecordDescriptor
@@ -33,6 +39,8 @@ Optional arguments:
33
39
  [INDEX]: name of the index to use (default: records)
34
40
  [VERIFY_CERTS]: verify certs of Elasticsearch instance (default: True)
35
41
  [HASH_RECORD]: make record unique by hashing record [slow] (default: False)
42
+ [REQUEST_TIMEOUT]: maximum duration in seconds for a request to Elastic (default: 30)
43
+ [MAX_RETRIES]: maximum retries before a record is marked as failed (default: 3)
36
44
  [_META_*]: record metadata fields (default: None)
37
45
  """
38
46
 
@@ -49,8 +57,19 @@ class ElasticWriter(AbstractWriter):
49
57
  hash_record: str | bool = False,
50
58
  api_key: str | None = None,
51
59
  queue_size: int = 100000,
60
+ request_timeout: int = 30,
61
+ max_retries: int = 3,
52
62
  **kwargs,
53
63
  ) -> None:
64
+ """Initialize the ElasticWriter.
65
+
66
+ Resources:
67
+ - https://elasticsearch-py.readthedocs.io/en/v8.17.1/api/elasticsearch.html
68
+ """
69
+
70
+ if not HAS_ELASTIC:
71
+ raise RuntimeError("Required dependency 'elasticsearch' missing")
72
+
54
73
  self.index = index
55
74
  self.uri = uri
56
75
  verify_certs = str(verify_certs).lower() in ("1", "true")
@@ -63,20 +82,23 @@ class ElasticWriter(AbstractWriter):
63
82
 
64
83
  self.queue: queue.Queue[Record | StopIteration] = queue.Queue(maxsize=queue_size)
65
84
  self.event = threading.Event()
85
+ self.exception: Exception | None = None
86
+ threading.excepthook = self.excepthook
66
87
 
67
88
  self.es = elasticsearch.Elasticsearch(
68
89
  uri,
69
90
  verify_certs=verify_certs,
70
91
  http_compress=http_compress,
71
92
  api_key=api_key,
93
+ request_timeout=request_timeout,
94
+ retry_on_timeout=True,
95
+ max_retries=max_retries,
72
96
  )
73
97
 
74
98
  self.json_packer = JsonRecordPacker()
75
99
 
76
100
  self.thread = threading.Thread(target=self.streaming_bulk_thread)
77
101
  self.thread.start()
78
- self.exception: Exception | None = None
79
- threading.excepthook = self.excepthook
80
102
 
81
103
  if not verify_certs:
82
104
  # Disable InsecureRequestWarning of urllib3, caused by the verify_certs flag.
@@ -140,20 +162,28 @@ class ElasticWriter(AbstractWriter):
140
162
  yield self.record_to_document(record, index=self.index)
141
163
 
142
164
  def streaming_bulk_thread(self) -> None:
143
- """Thread that streams the documents to ES via the bulk api"""
165
+ """Thread that streams the documents to ES via the bulk api.
144
166
 
145
- for ok, item in elasticsearch.helpers.streaming_bulk(
167
+ Resources:
168
+ - https://elasticsearch-py.readthedocs.io/en/v8.17.1/helpers.html#elasticsearch.helpers.streaming_bulk
169
+ - https://github.com/elastic/elasticsearch-py/blob/main/elasticsearch/helpers/actions.py#L362
170
+ """
171
+ for _ok, _item in elasticsearch.helpers.streaming_bulk(
146
172
  self.es,
147
173
  self.document_stream(),
148
- raise_on_error=False,
149
- raise_on_exception=False,
174
+ raise_on_error=True,
175
+ raise_on_exception=True,
176
+ # Some settings have to be redefined because streaming_bulk does not inherit them from the self.es instance.
177
+ max_retries=3,
150
178
  ):
151
- if not ok:
152
- log.error("Failed to insert %r", item)
179
+ pass
153
180
 
154
181
  self.event.set()
155
182
 
156
183
  def write(self, record: Record) -> None:
184
+ if self.exception:
185
+ raise self.exception
186
+
157
187
  self.queue.put(record)
158
188
 
159
189
  def flush(self) -> None:
@@ -179,6 +209,8 @@ class ElasticReader(AbstractReader):
179
209
  http_compress: str | bool = True,
180
210
  selector: None | Selector | CompiledSelector = None,
181
211
  api_key: str | None = None,
212
+ request_timeout: int = 30,
213
+ max_retries: int = 3,
182
214
  **kwargs,
183
215
  ) -> None:
184
216
  self.index = index
@@ -195,6 +227,9 @@ class ElasticReader(AbstractReader):
195
227
  verify_certs=verify_certs,
196
228
  http_compress=http_compress,
197
229
  api_key=api_key,
230
+ request_timeout=request_timeout,
231
+ retry_on_timeout=True,
232
+ max_retries=max_retries,
198
233
  )
199
234
 
200
235
  if not verify_certs:
@@ -30,9 +30,9 @@ class DefaultMissing(dict):
30
30
 
31
31
  Example:
32
32
  >>> d = DefaultMissing({"foo": "bar"})
33
- >>> d['foo']
33
+ >>> d["foo"]
34
34
  'bar'
35
- >>> d['missing_key']
35
+ >>> d["missing_key"]
36
36
  '{missing_key}'
37
37
  """
38
38
 
@@ -21,6 +21,14 @@ try:
21
21
  except ImportError:
22
22
  version = "unknown"
23
23
 
24
+ try:
25
+ import tqdm
26
+
27
+ HAS_TQDM = True
28
+
29
+ except ImportError:
30
+ HAS_TQDM = False
31
+
24
32
  log = logging.getLogger(__name__)
25
33
 
26
34
 
@@ -112,6 +120,12 @@ def main(argv: list[str] | None = None) -> int:
112
120
  help="Generate suffixes of length LEN for splitted output files",
113
121
  )
114
122
  output.add_argument("--multi-timestamp", action="store_true", help="Create records for datetime fields")
123
+ output.add_argument(
124
+ "-p",
125
+ "--progress",
126
+ action="store_true",
127
+ help="Show progress bar (requires tqdm)",
128
+ )
115
129
 
116
130
  advanced = parser.add_argument_group("advanced")
117
131
  advanced.add_argument(
@@ -217,7 +231,14 @@ def main(argv: list[str] | None = None) -> int:
217
231
  seen_desc = set()
218
232
  islice_stop = (args.count + args.skip) if args.count else None
219
233
  record_iterator = islice(record_stream(args.src, selector), args.skip, islice_stop)
234
+
235
+ if args.progress:
236
+ if not HAS_TQDM:
237
+ parser.error("tqdm is required for progress bar")
238
+ record_iterator = tqdm.tqdm(record_iterator, unit=" records", delay=sys.float_info.min)
239
+
220
240
  count = 0
241
+ record_writer = None
221
242
 
222
243
  try:
223
244
  record_writer = RecordWriter(uri)
@@ -246,7 +267,8 @@ def main(argv: list[str] | None = None) -> int:
246
267
  record_writer.write(rec)
247
268
 
248
269
  finally:
249
- record_writer.__exit__()
270
+ if record_writer:
271
+ record_writer.__exit__()
250
272
 
251
273
  if args.list:
252
274
  print(f"Processed {count} records")
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '3.19.dev8'
16
- __version_tuple__ = version_tuple = (3, 19, 'dev8')
15
+ __version__ = version = '3.20.dev1'
16
+ __version_tuple__ = version_tuple = (3, 20, 'dev1')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: flow.record
3
- Version: 3.19.dev8
3
+ Version: 3.20.dev1
4
4
  Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
5
5
  Author-email: Dissect Team <dissect@fox-it.com>
6
6
  License: Affero General Public License v3
@@ -43,6 +43,10 @@ Requires-Dist: flow.record[avro]; extra == "test"
43
43
  Requires-Dist: flow.record[elastic]; extra == "test"
44
44
  Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
45
45
  Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
46
+ Requires-Dist: tqdm; extra == "test"
47
+ Provides-Extra: full
48
+ Requires-Dist: flow.record[compression]; extra == "full"
49
+ Requires-Dist: tqdm; extra == "full"
46
50
 
47
51
  # flow.record
48
52
 
@@ -20,6 +20,10 @@ pytz
20
20
  [elastic]
21
21
  elasticsearch
22
22
 
23
+ [full]
24
+ flow.record[compression]
25
+ tqdm
26
+
23
27
  [geoip]
24
28
  maxminddb
25
29
 
@@ -30,6 +34,7 @@ httpx
30
34
  flow.record[compression]
31
35
  flow.record[avro]
32
36
  flow.record[elastic]
37
+ tqdm
33
38
 
34
39
  [test:platform_python_implementation != "PyPy" and python_version < "3.12"]
35
40
  duckdb
@@ -62,6 +62,11 @@ test = [
62
62
  "flow.record[elastic]",
63
63
  "duckdb; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
64
64
  "pytz; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
65
+ "tqdm",
66
+ ]
67
+ full = [
68
+ "flow.record[compression]",
69
+ "tqdm",
65
70
  ]
66
71
 
67
72
  [project.scripts]
@@ -696,3 +696,29 @@ def test_rdump_line_verbose(tmp_path: Path, capsys: pytest.CaptureFixture, rdump
696
696
  assert "data (bytes) =" in captured.out
697
697
  assert "counter (uint32) =" in captured.out
698
698
  assert "foo (string) =" in captured.out
699
+
700
+
701
+ def test_rdump_list_progress(tmp_path: Path, capsys: pytest.CaptureFixture) -> None:
702
+ TestRecord = RecordDescriptor(
703
+ "test/rdump/progress",
704
+ [
705
+ ("uint32", "counter"),
706
+ ],
707
+ )
708
+ record_path = tmp_path / "test.records"
709
+
710
+ with RecordWriter(record_path) as writer:
711
+ for i in range(100):
712
+ writer.write(TestRecord(counter=i))
713
+
714
+ rdump.main(["--list", "--progress", str(record_path)])
715
+ captured = capsys.readouterr()
716
+
717
+ # stderr should contain tqdm progress bar
718
+ # 100 records [00:00, 64987.67 records/s]
719
+ assert "\r100 records [" in captured.err
720
+ assert " records/s]" in captured.err
721
+
722
+ # stdout should contain the RecordDescriptor definition and count
723
+ assert "# <RecordDescriptor test/rdump/progress, hash=eeb21156>" in captured.out
724
+ assert "Processed 100 records" in captured.out
File without changes
File without changes