PyPI - flow.record - Versions diffs - 3.19.dev8__tar.gz → 3.20.dev1__tar.gz - Mend

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: flow.record
-Version: 3.19.dev8
+Version: 3.20.dev1
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -43,6 +43,10 @@ Requires-Dist: flow.record[avro]; extra == "test"
 Requires-Dist: flow.record[elastic]; extra == "test"
 Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
 Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
+Requires-Dist: tqdm; extra == "test"
+Provides-Extra: full
+Requires-Dist: flow.record[compression]; extra == "full"
+Requires-Dist: tqdm; extra == "full"
 # flow.record

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/elastic.py RENAMED Viewed

@@ -6,8 +6,14 @@ import queue
 import threading
 from typing import TYPE_CHECKING
-import elasticsearch
-import elasticsearch.helpers
+try:
+    import elasticsearch
+    import elasticsearch.helpers
+    HAS_ELASTIC = True
+except ImportError:
+    HAS_ELASTIC = False
 from flow.record.adapter import AbstractReader, AbstractWriter
 from flow.record.base import Record, RecordDescriptor
@@ -33,6 +39,8 @@ Optional arguments:
   [INDEX]: name of the index to use (default: records)
   [VERIFY_CERTS]: verify certs of Elasticsearch instance (default: True)
   [HASH_RECORD]: make record unique by hashing record [slow] (default: False)
+  [REQUEST_TIMEOUT]: maximum duration in seconds for a request to Elastic (default: 30)
+  [MAX_RETRIES]: maximum retries before a record is marked as failed (default: 3)
   [_META_*]: record metadata fields (default: None)
 """
@@ -49,8 +57,19 @@ class ElasticWriter(AbstractWriter):
         hash_record: str | bool = False,
         api_key: str | None = None,
         queue_size: int = 100000,
+        request_timeout: int = 30,
+        max_retries: int = 3,
         **kwargs,
     ) -> None:
+        """Initialize the ElasticWriter.
+        Resources:
+            - https://elasticsearch-py.readthedocs.io/en/v8.17.1/api/elasticsearch.html
+        """
+        if not HAS_ELASTIC:
+            raise RuntimeError("Required dependency 'elasticsearch' missing")
         self.index = index
         self.uri = uri
         verify_certs = str(verify_certs).lower() in ("1", "true")
@@ -63,20 +82,23 @@ class ElasticWriter(AbstractWriter):
         self.queue: queue.Queue[Record | StopIteration] = queue.Queue(maxsize=queue_size)
         self.event = threading.Event()
+        self.exception: Exception | None = None
+        threading.excepthook = self.excepthook
         self.es = elasticsearch.Elasticsearch(
             uri,
             verify_certs=verify_certs,
             http_compress=http_compress,
             api_key=api_key,
+            request_timeout=request_timeout,
+            retry_on_timeout=True,
+            max_retries=max_retries,
         )
         self.json_packer = JsonRecordPacker()
         self.thread = threading.Thread(target=self.streaming_bulk_thread)
         self.thread.start()
-        self.exception: Exception | None = None
-        threading.excepthook = self.excepthook
         if not verify_certs:
             # Disable InsecureRequestWarning of urllib3, caused by the verify_certs flag.
@@ -140,20 +162,28 @@ class ElasticWriter(AbstractWriter):
             yield self.record_to_document(record, index=self.index)
     def streaming_bulk_thread(self) -> None:
-        """Thread that streams the documents to ES via the bulk api"""
+        """Thread that streams the documents to ES via the bulk api.
-        for ok, item in elasticsearch.helpers.streaming_bulk(
+        Resources:
+            - https://elasticsearch-py.readthedocs.io/en/v8.17.1/helpers.html#elasticsearch.helpers.streaming_bulk
+            - https://github.com/elastic/elasticsearch-py/blob/main/elasticsearch/helpers/actions.py#L362
+        """
+        for _ok, _item in elasticsearch.helpers.streaming_bulk(
             self.es,
             self.document_stream(),
-            raise_on_error=False,
-            raise_on_exception=False,
+            raise_on_error=True,
+            raise_on_exception=True,
+            # Some settings have to be redefined because streaming_bulk does not inherit them from the self.es instance.
+            max_retries=3,
         ):
-            if not ok:
-                log.error("Failed to insert %r", item)
+            pass
         self.event.set()
     def write(self, record: Record) -> None:
+        if self.exception:
+            raise self.exception
         self.queue.put(record)
     def flush(self) -> None:
@@ -179,6 +209,8 @@ class ElasticReader(AbstractReader):
         http_compress: str | bool = True,
         selector: None | Selector | CompiledSelector = None,
         api_key: str | None = None,
+        request_timeout: int = 30,
+        max_retries: int = 3,
         **kwargs,
     ) -> None:
         self.index = index
@@ -195,6 +227,9 @@ class ElasticReader(AbstractReader):
             verify_certs=verify_certs,
             http_compress=http_compress,
             api_key=api_key,
+            request_timeout=request_timeout,
+            retry_on_timeout=True,
+            max_retries=max_retries,
         )
         if not verify_certs:

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/adapter/text.py RENAMED Viewed

@@ -30,9 +30,9 @@ class DefaultMissing(dict):
     Example:
         >>> d = DefaultMissing({"foo": "bar"})
-        >>> d['foo']
+        >>> d["foo"]
         'bar'
-        >>> d['missing_key']
+        >>> d["missing_key"]
         '{missing_key}'
     """

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/tools/rdump.py RENAMED Viewed

@@ -21,6 +21,14 @@ try:
 except ImportError:
     version = "unknown"
+try:
+    import tqdm
+    HAS_TQDM = True
+except ImportError:
+    HAS_TQDM = False
 log = logging.getLogger(__name__)
@@ -112,6 +120,12 @@ def main(argv: list[str] | None = None) -> int:
         help="Generate suffixes of length LEN for splitted output files",
     )
     output.add_argument("--multi-timestamp", action="store_true", help="Create records for datetime fields")
+    output.add_argument(
+        "-p",
+        "--progress",
+        action="store_true",
+        help="Show progress bar (requires tqdm)",
+    )
     advanced = parser.add_argument_group("advanced")
     advanced.add_argument(
@@ -217,7 +231,14 @@ def main(argv: list[str] | None = None) -> int:
     seen_desc = set()
     islice_stop = (args.count + args.skip) if args.count else None
     record_iterator = islice(record_stream(args.src, selector), args.skip, islice_stop)
+    if args.progress:
+        if not HAS_TQDM:
+            parser.error("tqdm is required for progress bar")
+        record_iterator = tqdm.tqdm(record_iterator, unit=" records", delay=sys.float_info.min)
     count = 0
+    record_writer = None
     try:
         record_writer = RecordWriter(uri)
@@ -246,7 +267,8 @@ def main(argv: list[str] | None = None) -> int:
                     record_writer.write(rec)
     finally:
-        record_writer.__exit__()
+        if record_writer:
+            record_writer.__exit__()
     if args.list:
         print(f"Processed {count} records")

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow/record/version.py RENAMED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '3.19.dev8'
-__version_tuple__ = version_tuple = (3, 19, 'dev8')
+__version__ = version = '3.20.dev1'
+__version_tuple__ = version_tuple = (3, 20, 'dev1')

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: flow.record
-Version: 3.19.dev8
+Version: 3.20.dev1
 Summary: A library for defining and creating structured data (called records) that can be streamed to disk or piped to other tools that use flow.record
 Author-email: Dissect Team <dissect@fox-it.com>
 License: Affero General Public License v3
@@ -43,6 +43,10 @@ Requires-Dist: flow.record[avro]; extra == "test"
 Requires-Dist: flow.record[elastic]; extra == "test"
 Requires-Dist: duckdb; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
 Requires-Dist: pytz; (platform_python_implementation != "PyPy" and python_version < "3.12") and extra == "test"
+Requires-Dist: tqdm; extra == "test"
+Provides-Extra: full
+Requires-Dist: flow.record[compression]; extra == "full"
+Requires-Dist: tqdm; extra == "full"
 # flow.record

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/flow.record.egg-info/requires.txt RENAMED Viewed

@@ -20,6 +20,10 @@ pytz
 [elastic]
 elasticsearch
+[full]
+flow.record[compression]
+tqdm
 [geoip]
 maxminddb
@@ -30,6 +34,7 @@ httpx
 flow.record[compression]
 flow.record[avro]
 flow.record[elastic]
+tqdm
 [test:platform_python_implementation != "PyPy" and python_version < "3.12"]
 duckdb

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/pyproject.toml RENAMED Viewed

@@ -62,6 +62,11 @@ test = [
     "flow.record[elastic]",
     "duckdb; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
     "pytz; platform_python_implementation != 'PyPy' and python_version < '3.12'", # duckdb
+    "tqdm",
+]
+full = [
+    "flow.record[compression]",
+    "tqdm",
 ]
 [project.scripts]

{flow_record-3.19.dev8 → flow_record-3.20.dev1}/tests/test_rdump.py RENAMED Viewed

@@ -696,3 +696,29 @@ def test_rdump_line_verbose(tmp_path: Path, capsys: pytest.CaptureFixture, rdump
     assert "data (bytes) =" in captured.out
     assert "counter (uint32) =" in captured.out
     assert "foo (string) =" in captured.out
+def test_rdump_list_progress(tmp_path: Path, capsys: pytest.CaptureFixture) -> None:
+    TestRecord = RecordDescriptor(
+        "test/rdump/progress",
+        [
+            ("uint32", "counter"),
+        ],
+    )
+    record_path = tmp_path / "test.records"
+    with RecordWriter(record_path) as writer:
+        for i in range(100):
+            writer.write(TestRecord(counter=i))
+    rdump.main(["--list", "--progress", str(record_path)])
+    captured = capsys.readouterr()
+    # stderr should contain tqdm progress bar
+    #   100 records [00:00, 64987.67 records/s]
+    assert "\r100 records [" in captured.err
+    assert " records/s]" in captured.err
+    # stdout should contain the RecordDescriptor definition and count
+    assert "# <RecordDescriptor test/rdump/progress, hash=eeb21156>" in captured.out
+    assert "Processed 100 records" in captured.out