starfish-events 3.0.0a42__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- starfish_events-3.0.0a42/PKG-INFO +14 -0
- starfish_events-3.0.0a42/README.md +88 -0
- starfish_events-3.0.0a42/pyproject.toml +37 -0
- starfish_events-3.0.0a42/setup.cfg +4 -0
- starfish_events-3.0.0a42/starfish_events/__init__.py +33 -0
- starfish_events-3.0.0a42/starfish_events/encode.py +63 -0
- starfish_events-3.0.0a42/starfish_events/plugin.py +190 -0
- starfish_events-3.0.0a42/starfish_events.egg-info/PKG-INFO +14 -0
- starfish_events-3.0.0a42/starfish_events.egg-info/SOURCES.txt +13 -0
- starfish_events-3.0.0a42/starfish_events.egg-info/dependency_links.txt +1 -0
- starfish_events-3.0.0a42/starfish_events.egg-info/requires.txt +10 -0
- starfish_events-3.0.0a42/starfish_events.egg-info/top_level.txt +1 -0
- starfish_events-3.0.0a42/tests/test_cross_language.py +223 -0
- starfish_events-3.0.0a42/tests/test_encode.py +217 -0
- starfish_events-3.0.0a42/tests/test_integration.py +439 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: starfish-events
|
|
3
|
+
Version: 3.0.0a42
|
|
4
|
+
Summary: Starfish server plugin that intercepts JSON event-batch pushes and encodes them as Parquet on S3 (Python mirror of @drakkar.software/starfish-events)
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: starfish-protocol
|
|
7
|
+
Requires-Dist: starfish-server
|
|
8
|
+
Requires-Dist: pyarrow>=24.0
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
11
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
12
|
+
Requires-Dist: httpx>=0.25.0; extra == "dev"
|
|
13
|
+
Requires-Dist: fastapi>=0.100; extra == "dev"
|
|
14
|
+
Requires-Dist: pyarrow>=24.0; extra == "dev"
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# starfish-events
|
|
2
|
+
|
|
3
|
+
Starfish server plugin that intercepts JSON event-batch pushes and encodes them as
|
|
4
|
+
[Apache Parquet](https://parquet.apache.org/) files written directly to the object
|
|
5
|
+
store (typically S3).
|
|
6
|
+
|
|
7
|
+
Mirrors [`@drakkar.software/starfish-events`](../ts/events) (TypeScript) with
|
|
8
|
+
identical Parquet encoding — both are locked to the same test vectors.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install starfish-events
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## How it works
|
|
17
|
+
|
|
18
|
+
1. Register a JSON-typed collection (`allowed_mime_types: ["application/json"]`) in
|
|
19
|
+
your `SyncConfig`.
|
|
20
|
+
2. Attach `create_events_server_plugin` to `create_sync_router`.
|
|
21
|
+
3. Each push to that collection is intercepted: the JSON event batch is encoded as
|
|
22
|
+
Parquet and written via `store.put_bytes`. The default JSON document write is
|
|
23
|
+
short-circuited — **no JSON is persisted alongside the Parquet**.
|
|
24
|
+
|
|
25
|
+
One Parquet file is written per push (one file per batch). DuckDB's
|
|
26
|
+
`read_parquet('s3://…/**/*.parquet')` glob treats all files under the prefix as one
|
|
27
|
+
logical dataset.
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from starfish_server.storage.s3 import S3ObjectStore
|
|
33
|
+
from starfish_server.router.sync_router import create_sync_router
|
|
34
|
+
from starfish_protocol.config import SyncConfig, CollectionConfig, SyncRouterOptions
|
|
35
|
+
from starfish_events import create_events_server_plugin
|
|
36
|
+
|
|
37
|
+
store = S3ObjectStore(...)
|
|
38
|
+
|
|
39
|
+
plugin = create_events_server_plugin(
|
|
40
|
+
store=store,
|
|
41
|
+
collection="events",
|
|
42
|
+
storage_path="events/{app}/{batchId}",
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
router = create_sync_router(
|
|
46
|
+
SyncRouterOptions(
|
|
47
|
+
store=store,
|
|
48
|
+
config=SyncConfig(
|
|
49
|
+
version=1,
|
|
50
|
+
collections=[
|
|
51
|
+
CollectionConfig(
|
|
52
|
+
name="events",
|
|
53
|
+
storage_path="events/{app}/{batchId}",
|
|
54
|
+
read_roles=["admin"],
|
|
55
|
+
write_roles=["public"],
|
|
56
|
+
encryption="none",
|
|
57
|
+
allowed_mime_types=["application/json"], # JSON-typed, not Parquet
|
|
58
|
+
max_body_bytes=8_000_000,
|
|
59
|
+
)
|
|
60
|
+
],
|
|
61
|
+
),
|
|
62
|
+
plugins=[plugin],
|
|
63
|
+
)
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## API
|
|
68
|
+
|
|
69
|
+
### `create_events_server_plugin(*, store, collection, storage_path) -> ServerPlugin`
|
|
70
|
+
|
|
71
|
+
| Parameter | Type | Description |
|
|
72
|
+
|---|---|---|
|
|
73
|
+
| `store` | `AbstractObjectStore` | Object store with `put_bytes`. Pass the same instance as `create_sync_router`. |
|
|
74
|
+
| `collection` | `str` | Name of the collection to intercept (e.g. `"events"`). |
|
|
75
|
+
| `storage_path` | `str` | Storage-path template for the Parquet key. Supports `{param}` placeholders from the push URL. The `.parquet` extension is appended automatically if absent. |
|
|
76
|
+
|
|
77
|
+
The plugin adds `received_at` (ISO-8601 UTC) to every event row before encoding.
|
|
78
|
+
|
|
79
|
+
## Querying with DuckDB
|
|
80
|
+
|
|
81
|
+
```sql
|
|
82
|
+
SELECT event_type, COUNT(*) AS n
|
|
83
|
+
FROM read_parquet('s3://my-bucket/events/myapp/**/*.parquet')
|
|
84
|
+
GROUP BY event_type
|
|
85
|
+
ORDER BY n DESC;
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
See [Analytics — Events & Parquet](/analytics/events) for the full guide.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "starfish-events"
|
|
7
|
+
version = "3.0.0a42"
|
|
8
|
+
description = "Starfish server plugin that intercepts JSON event-batch pushes and encodes them as Parquet on S3 (Python mirror of @drakkar.software/starfish-events)"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"starfish-protocol",
|
|
12
|
+
"starfish-server",
|
|
13
|
+
# pyarrow is the ONLY heavy dep and lives here exclusively (the Python analog
|
|
14
|
+
# of hyparquet-writer living only in packages/ts/events). The API surface used
|
|
15
|
+
# here (pa.table, pa.array, pq.write_table) has been stable since 10.0, but the
|
|
16
|
+
# supported floor is the >=24.0 pin below (TODO: confirm the tested version).
|
|
17
|
+
"pyarrow>=24.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
dev = [
|
|
22
|
+
"pytest>=7.0",
|
|
23
|
+
"pytest-asyncio>=0.21",
|
|
24
|
+
"httpx>=0.25.0",
|
|
25
|
+
"fastapi>=0.100",
|
|
26
|
+
# pyarrow is a runtime dep but we list it here too so `uv sync --extra dev`
|
|
27
|
+
# installs it for the round-trip decode assertions in tests.
|
|
28
|
+
"pyarrow>=24.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[tool.uv.sources]
|
|
32
|
+
starfish-protocol = { path = "../protocol", editable = true }
|
|
33
|
+
starfish-server = { path = "../server", editable = true }
|
|
34
|
+
|
|
35
|
+
[tool.pytest.ini_options]
|
|
36
|
+
asyncio_mode = "auto"
|
|
37
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""``starfish-events`` — Starfish server plugin (Python) that intercepts JSON
|
|
2
|
+
event-batch pushes from the SunGlasses adapter and encodes them as Parquet
|
|
3
|
+
files on S3.
|
|
4
|
+
|
|
5
|
+
Python mirror of ``@drakkar.software/starfish-events`` (``packages/ts/events``).
|
|
6
|
+
|
|
7
|
+
Public surface
|
|
8
|
+
--------------
|
|
9
|
+
- :func:`create_events_server_plugin` — factory that returns a
|
|
10
|
+
:class:`~starfish_protocol.plugins.ServerPlugin` whose ``intercept_push``
|
|
11
|
+
hook encodes event batches as Parquet.
|
|
12
|
+
- :func:`encode_parquet` — low-level encoder (exposed for testing and direct
|
|
13
|
+
use).
|
|
14
|
+
- :data:`COLUMNS` — the fixed 10-column tuple that forms the Parquet schema.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from starfish_events.encode import COLUMNS, encode_parquet
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def __getattr__(name: str):
    """Resolve :func:`create_events_server_plugin` on first attribute access.

    This is the module-level ``__getattr__`` hook. ``starfish_events.plugin``
    (which imports ``starfish_server``) is only imported when a caller actually
    asks for the factory, so importing the package just for the encoder does
    not load the plugin module.

    :param name: Attribute name being looked up on the package.
    :returns: The ``create_events_server_plugin`` factory.
    :raises AttributeError: For any name other than
        ``"create_events_server_plugin"``.
    """
    # Guard clause first: every unknown attribute fails fast without importing.
    if name != "create_events_server_plugin":
        raise AttributeError(f"module 'starfish_events' has no attribute {name!r}")

    from starfish_events.plugin import create_events_server_plugin as factory

    return factory
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
__all__ = ["COLUMNS", "encode_parquet", "create_events_server_plugin"]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Parquet encoding for SunGlasses event rows.
|
|
2
|
+
|
|
3
|
+
Column schema matches the ``EventRow`` produced by ``apps/ingest-server/src/schema.ts``
|
|
4
|
+
so DuckDB queries are identical regardless of which backend delivered the data
|
|
5
|
+
(HTTP ingest server vs. this Starfish-events plugin).
|
|
6
|
+
|
|
7
|
+
All columns are STRING (BYTE_ARRAY with the UTF8/STRING annotation in Parquet terms; read as VARCHAR by DuckDB). ``UNCOMPRESSED`` codec
|
|
8
|
+
avoids a native/WASM compressor dependency and matches the TypeScript side
|
|
9
|
+
(``packages/ts/events/src/encode.ts``).
|
|
10
|
+
|
|
11
|
+
Privacy: the caller is responsible for never logging the contents of
|
|
12
|
+
``distinct_id``, ``properties``, or ``context``. This module stores whatever
|
|
13
|
+
it receives opaquely.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import io
|
|
18
|
+
|
|
19
|
+
import pyarrow as pa
|
|
20
|
+
import pyarrow.parquet as pq
|
|
21
|
+
|
|
22
|
+
# Schema contract: the column names, in the exact order they are written to
# Parquet. Kept in lockstep with apps/ingest-server EventRow and
# packages/ts/events/src/encode.ts. DuckDB consumers rely on this order, so do
# not reorder.
COLUMNS: tuple[str, ...] = (
    "event_type",
    "event",
    "distinct_id",
    "anonymous_id",
    "ts",
    "message_id",
    "properties",
    "context",
    "dt",
    # Stamped server-side by the plugin before encoding.
    "received_at",
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def encode_parquet(rows: list[dict]) -> bytes:
    """Serialize flat event-row dicts into an uncompressed Parquet file.

    One string column is produced for every name in :data:`COLUMNS`, in that
    order. Each cell is ``str(value)``; a key that is absent or whose value is
    ``None`` becomes ``""`` (empty string). Keys not listed in
    :data:`COLUMNS` are ignored.

    :param rows: List of row dicts; each row is one SunGlasses event as
        flattened by the adapter's ``toStarfishRow`` (or an equivalent mapper).
    :returns: The Parquet file contents as bytes (a Parquet file starts and
        ends with the ``PAR1`` magic).
    :raises Exception: Whatever pyarrow raises on failure is propagated; the
        plugin turns it into an HTTP 500 so the client retries.
    """
    string_type = pa.string()

    columns = {}
    for name in COLUMNS:
        cells = []
        for row in rows:
            # Absent key and explicit None both map to the empty string.
            cells.append("" if row.get(name) is None else str(row[name]))
        columns[name] = pa.array(cells, type=string_type)

    sink = io.BytesIO()
    # compression="none" keeps the output free of any codec dependency.
    pq.write_table(pa.table(columns), sink, compression="none")
    return sink.getvalue()
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Starfish server plugin: intercepts JSON event-batch pushes and encodes them
|
|
2
|
+
as Parquet files written directly to the object store (typically S3).
|
|
3
|
+
|
|
4
|
+
How it works
|
|
5
|
+
------------
|
|
6
|
+
1. Register a JSON-typed collection (``allowed_mime_types: ["application/json"]``)
|
|
7
|
+
with public write access.
|
|
8
|
+
2. Attach this plugin to the sync router.
|
|
9
|
+
3. Each push to that collection is intercepted here; the JSON event batch is
|
|
10
|
+
encoded as Parquet and stored via ``store.put_bytes``, short-circuiting the
|
|
11
|
+
default JSON document write so no JSON is persisted alongside the Parquet.
|
|
12
|
+
|
|
13
|
+
Collection requirement
|
|
14
|
+
-----------------------
|
|
15
|
+
The intercepted collection **must** be JSON-typed — ``intercept_push`` only
|
|
16
|
+
receives a populated ``raw_body`` for JSON collections (see
|
|
17
|
+
``route_builder.py:847``). A binary (Parquet-typed) collection would yield an
|
|
18
|
+
empty body.
|
|
19
|
+
|
|
20
|
+
One file per batch
|
|
21
|
+
------------------
|
|
22
|
+
Parquet's column-footer format makes in-place append impractical. Each
|
|
23
|
+
``send()`` call from the SunGlasses adapter writes a unique path (batchId in
|
|
24
|
+
the storage-path template). DuckDB's
|
|
25
|
+
``read_parquet('s3://…/**/*.parquet')`` glob treats all files under the prefix
|
|
26
|
+
as one logical dataset.
|
|
27
|
+
|
|
28
|
+
Privacy
|
|
29
|
+
-------
|
|
30
|
+
Never log ``distinct_id``, ``properties``, or ``context``. Log counts only.
|
|
31
|
+
These values ride as opaque strings into Parquet.
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import hashlib
|
|
36
|
+
import json
|
|
37
|
+
import logging
|
|
38
|
+
from datetime import datetime, timezone
|
|
39
|
+
from typing import TYPE_CHECKING
|
|
40
|
+
|
|
41
|
+
from starfish_protocol.constants import PARQUET_MIME_TYPE
|
|
42
|
+
from starfish_protocol.plugins import PushHookContext, PushHookResult, ServerPlugin
|
|
43
|
+
from starfish_server.router.route_builder import resolve_document_key
|
|
44
|
+
|
|
45
|
+
from starfish_events.encode import encode_parquet
|
|
46
|
+
|
|
47
|
+
if TYPE_CHECKING:
|
|
48
|
+
from starfish_server.storage.base import AbstractObjectStore
|
|
49
|
+
|
|
50
|
+
_log = logging.getLogger(__name__)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _supports_binary(store: object) -> bool:
    """Return True when *store* has overridden ``put_bytes`` (supports binary writes).

    The ``put_bytes`` attribute found on ``type(store)`` is compared by
    identity with the one defined on
    :class:`~starfish_server.storage.base.AbstractObjectStore`. A store whose
    class still exposes the base implementation is reported as unsupported.
    (The base ``put_bytes`` is expected to raise ``NotImplementedError`` —
    confirm against ``starfish_server.storage.base``.)

    A *store* whose type has no ``put_bytes`` attribute at all is also
    reported as unsupported rather than raising ``AttributeError``, so the
    plugin factory can raise its documented ``TypeError``.

    :param store: Candidate object store (any object is accepted).
    :returns: ``True`` only when ``type(store)`` provides a ``put_bytes`` that
        is not the base-class one.
    """
    impl = getattr(type(store), "put_bytes", None)
    if impl is None:
        # Not an object store at all: nothing to compare against.
        return False

    # Function-scope import, deferred until a comparison is actually needed.
    from starfish_server.storage.base import AbstractObjectStore as _Base

    return impl is not _Base.put_bytes
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def create_events_server_plugin(
    *,
    store: "AbstractObjectStore",
    collection: str,
    storage_path: str,
) -> ServerPlugin:
    """Build a :class:`~starfish_protocol.plugins.ServerPlugin` that encodes
    SunGlasses event batches as Parquet and writes them to the object store.

    The returned plugin's ``intercept_push`` hook behaves as follows:

    * pushes to any other collection get ``action="proceed"``;
    * a body that is not valid JSON, or whose ``data.events`` list contains a
      non-object entry, is rejected with HTTP 400;
    * a missing or non-list ``data.events`` is treated as an empty batch;
    * every event gets a server-side ``received_at`` timestamp (UTC,
      millisecond precision, ``Z`` suffix), the batch is encoded with
      :func:`encode_parquet` and written with ``store.put_bytes``;
    * an encoding or storage failure is rejected with HTTP 500;
    * on success the hook responds 200 with ``{"hash": <sha256 hex of the
      Parquet bytes>}``.

    :param store: Object store the plugin writes Parquet files to.  Must
        implement ``put_bytes`` (e.g. ``S3ObjectStore`` or ``MemoryObjectStore``).
        Pass the **same** store instance you pass to ``create_sync_router``.
    :param collection: Name of the collection to intercept.  Must match the
        ``name`` field in the ``SyncConfig.collections`` entry.
        Example: ``"events"``.
    :param storage_path: Storage-path template for the output Parquet key.
        Supports ``{param}`` placeholders resolved from the push URL's path
        params.  Example: ``"events/{app}/{batchId}"`` →
        ``"events/myapp/<uuid>"``.  The plugin appends ``.parquet`` when the
        resolved key does not already end with it.
    :returns: A ``ServerPlugin`` named ``"starfish-events"``.
    :raises TypeError: When *store* does not override ``put_bytes``.

    Example wiring::

        from starfish_server.storage.s3 import S3ObjectStore
        from starfish_server.router.sync_router import create_sync_router
        from starfish_protocol.config import SyncConfig, CollectionConfig, SyncRouterOptions
        from starfish_events import create_events_server_plugin

        store = S3ObjectStore(...)
        plugin = create_events_server_plugin(
            store=store,
            collection="events",
            storage_path="events/{app}/{batchId}",
        )
        router = create_sync_router(
            SyncRouterOptions(
                store=store,
                config=SyncConfig(
                    version=1,
                    collections=[
                        CollectionConfig(
                            name="events",
                            storage_path="events/{app}/{batchId}",
                            read_roles=["admin"],
                            write_roles=["public"],
                            encryption="none",
                            allowed_mime_types=["application/json"],  # JSON-typed!
                            max_body_bytes=8_000_000,
                        )
                    ],
                ),
                plugins=[plugin],
            )
        )
    """
    if not _supports_binary(store):
        raise TypeError(
            "[starfish-events] the provided store does not implement put_bytes "
            "(binary writes). Use S3ObjectStore, FilesystemObjectStore, or "
            "MemoryObjectStore."
        )

    async def _intercept_push(ctx: PushHookContext) -> PushHookResult:
        # Only intercept the configured collection; let everything else proceed.
        if ctx.collection != collection:
            return PushHookResult(action="proceed")

        # Parse the push envelope: { data: { events: [...] }, baseHash }.
        # json.loads raises ValueError (JSONDecodeError / UnicodeDecodeError)
        # for malformed input and TypeError for a non-str/bytes body.
        try:
            envelope = json.loads(ctx.raw_body)
        except (TypeError, ValueError):
            return PushHookResult(
                action="reject",
                status=400,
                error="Invalid JSON body — expected { data: { events: [...] }, baseHash }",
            )

        data = (envelope.get("data") or {}) if isinstance(envelope, dict) else {}
        raw = data.get("events") if isinstance(data, dict) else None
        events: list[dict] = raw if isinstance(raw, list) else []

        # The body is untrusted: a non-object entry would make the dict
        # unpacking below raise TypeError, so reject the batch up front.
        if not all(isinstance(event, dict) for event in events):
            return PushHookResult(
                action="reject",
                status=400,
                error="Invalid event batch — every entry in data.events must be a JSON object",
            )

        # Stamp ingest time server-side.  Never log event contents.
        dt_now = datetime.now(timezone.utc)
        received_at = (
            dt_now.strftime("%Y-%m-%dT%H:%M:%S.")
            + f"{dt_now.microsecond // 1000:03d}Z"
        )
        rows = [{**event, "received_at": received_at} for event in events]

        # Encode to Parquet.
        try:
            parquet_bytes = encode_parquet(rows)
        except Exception as exc:
            _log.error("[starfish-events] Parquet encoding failed: %s", exc)
            return PushHookResult(action="reject", status=500, error="Parquet encoding failed")

        # Resolve the output key from the storagePath template + URL params.
        key = resolve_document_key(storage_path, dict(ctx.params))
        if not key.endswith(".parquet"):
            key += ".parquet"

        # Write to the object store.  On failure return a clean HTTP 500 so the
        # SunGlasses adapter sees a non-2xx and the SDK requeues the batch
        # (at-least-once delivery guarantee).
        try:
            await store.put_bytes(key, parquet_bytes, content_type=PARQUET_MIME_TYPE)
        except Exception as exc:
            _log.error("[starfish-events] put_bytes failed for key %r: %s", key, exc)
            return PushHookResult(action="reject", status=500, error="Storage write failed")

        # Compute SHA-256 to match the binary push response format.
        sha = hashlib.sha256(parquet_bytes).hexdigest()

        # Privacy: log only counts, never event contents.
        _log.info(
            "[starfish-events] wrote %d event(s) → %s (%d bytes)",
            len(events),
            key,
            len(parquet_bytes),
        )

        return PushHookResult(action="respond", status=200, body={"hash": sha})

    return ServerPlugin(name="starfish-events", intercept_push=_intercept_push)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
__all__ = ["create_events_server_plugin"]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: starfish-events
|
|
3
|
+
Version: 3.0.0a42
|
|
4
|
+
Summary: Starfish server plugin that intercepts JSON event-batch pushes and encodes them as Parquet on S3 (Python mirror of @drakkar.software/starfish-events)
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: starfish-protocol
|
|
7
|
+
Requires-Dist: starfish-server
|
|
8
|
+
Requires-Dist: pyarrow>=24.0
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
11
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
12
|
+
Requires-Dist: httpx>=0.25.0; extra == "dev"
|
|
13
|
+
Requires-Dist: fastapi>=0.100; extra == "dev"
|
|
14
|
+
Requires-Dist: pyarrow>=24.0; extra == "dev"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
starfish_events/__init__.py
|
|
4
|
+
starfish_events/encode.py
|
|
5
|
+
starfish_events/plugin.py
|
|
6
|
+
starfish_events.egg-info/PKG-INFO
|
|
7
|
+
starfish_events.egg-info/SOURCES.txt
|
|
8
|
+
starfish_events.egg-info/dependency_links.txt
|
|
9
|
+
starfish_events.egg-info/requires.txt
|
|
10
|
+
starfish_events.egg-info/top_level.txt
|
|
11
|
+
tests/test_cross_language.py
|
|
12
|
+
tests/test_encode.py
|
|
13
|
+
tests/test_integration.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
starfish_events
|