starfish-events 3.0.0a42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: starfish-events
3
+ Version: 3.0.0a42
4
+ Summary: Starfish server plugin that intercepts JSON event-batch pushes and encodes them as Parquet on S3 (Python mirror of @drakkar.software/starfish-events)
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: starfish-protocol
7
+ Requires-Dist: starfish-server
8
+ Requires-Dist: pyarrow>=24.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=7.0; extra == "dev"
11
+ Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
12
+ Requires-Dist: httpx>=0.25.0; extra == "dev"
13
+ Requires-Dist: fastapi>=0.100; extra == "dev"
14
+ Requires-Dist: pyarrow>=24.0; extra == "dev"
@@ -0,0 +1,88 @@
1
+ # starfish-events
2
+
3
+ Starfish server plugin that intercepts JSON event-batch pushes and encodes them as
4
+ [Apache Parquet](https://parquet.apache.org/) files written directly to the object
5
+ store (typically S3).
6
+
7
+ Mirrors [`@drakkar.software/starfish-events`](../ts/events) (TypeScript) with
8
+ identical Parquet encoding — both are locked to the same test vectors.
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install starfish-events
14
+ ```
15
+
16
+ ## How it works
17
+
18
+ 1. Register a JSON-typed collection (`allowed_mime_types: ["application/json"]`) in
19
+ your `SyncConfig`.
20
+ 2. Attach `create_events_server_plugin` to `create_sync_router`.
21
+ 3. Each push to that collection is intercepted: the JSON event batch is encoded as
22
+ Parquet and written via `store.put_bytes`. The default JSON document write is
23
+ short-circuited — **no JSON is persisted alongside the Parquet**.
24
+
25
+ One Parquet file is written per push (one file per batch). DuckDB's
26
+ `read_parquet('s3://…/**/*.parquet')` glob treats all files under the prefix as one
27
+ logical dataset.
28
+
29
+ ## Usage
30
+
31
+ ```python
32
+ from starfish_server.storage.s3 import S3ObjectStore
33
+ from starfish_server.router.sync_router import create_sync_router
34
+ from starfish_protocol.config import SyncConfig, CollectionConfig, SyncRouterOptions
35
+ from starfish_events import create_events_server_plugin
36
+
37
+ store = S3ObjectStore(...)
38
+
39
+ plugin = create_events_server_plugin(
40
+ store=store,
41
+ collection="events",
42
+ storage_path="events/{app}/{batchId}",
43
+ )
44
+
45
+ router = create_sync_router(
46
+ SyncRouterOptions(
47
+ store=store,
48
+ config=SyncConfig(
49
+ version=1,
50
+ collections=[
51
+ CollectionConfig(
52
+ name="events",
53
+ storage_path="events/{app}/{batchId}",
54
+ read_roles=["admin"],
55
+ write_roles=["public"],
56
+ encryption="none",
57
+ allowed_mime_types=["application/json"], # JSON-typed, not Parquet
58
+ max_body_bytes=8_000_000,
59
+ )
60
+ ],
61
+ ),
62
+ plugins=[plugin],
63
+ )
64
+ )
65
+ ```
66
+
67
+ ## API
68
+
69
+ ### `create_events_server_plugin(*, store, collection, storage_path) -> ServerPlugin`
70
+
71
+ | Parameter | Type | Description |
72
+ |---|---|---|
73
+ | `store` | `AbstractObjectStore` | Object store with `put_bytes`. Pass the same instance as `create_sync_router`. |
74
+ | `collection` | `str` | Name of the collection to intercept (e.g. `"events"`). |
75
+ | `storage_path` | `str` | Storage-path template for the Parquet key. Supports `{param}` placeholders from the push URL. The `.parquet` extension is appended automatically if absent. |
76
+
77
+ The plugin adds `received_at` (ISO-8601 UTC) to every event row before encoding.
78
+
79
+ ## Querying with DuckDB
80
+
81
+ ```sql
82
+ SELECT event_type, COUNT(*) AS n
83
+ FROM read_parquet('s3://my-bucket/events/myapp/**/*.parquet')
84
+ GROUP BY event_type
85
+ ORDER BY n DESC;
86
+ ```
87
+
88
+ See [Analytics — Events & Parquet](/analytics/events) for the full guide.
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "starfish-events"
7
+ version = "3.0.0a42"
8
+ description = "Starfish server plugin that intercepts JSON event-batch pushes and encodes them as Parquet on S3 (Python mirror of @drakkar.software/starfish-events)"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "starfish-protocol",
12
+ "starfish-server",
13
+ # pyarrow is the ONLY heavy dep and lives here exclusively (the Python analog
14
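+ # of hyparquet-writer living only in packages/ts/events). The pyarrow API used
15
+ # here (pa.table, pa.array, pq.write_table) has been stable since 10.x, but the
16
+ # supported floor is the ">=24.0" pin below (TODO: an earlier note said "tested against 21.x"; re-confirm the tested version).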
17
+ "pyarrow>=24.0",
18
+ ]
19
+
20
+ [project.optional-dependencies]
21
+ dev = [
22
+ "pytest>=7.0",
23
+ "pytest-asyncio>=0.21",
24
+ "httpx>=0.25.0",
25
+ "fastapi>=0.100",
26
+ # pyarrow is a runtime dep but we list it here too so `uv sync --extra dev`
27
+ # installs it for the round-trip decode assertions in tests.
28
+ "pyarrow>=24.0",
29
+ ]
30
+
31
+ [tool.uv.sources]
32
+ starfish-protocol = { path = "../protocol", editable = true }
33
+ starfish-server = { path = "../server", editable = true }
34
+
35
+ [tool.pytest.ini_options]
36
+ asyncio_mode = "auto"
37
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,33 @@
1
+ """``starfish-events`` — Starfish server plugin (Python) that intercepts JSON
2
+ event-batch pushes from the SunGlasses adapter and encodes them as Parquet
3
+ files on S3.
4
+
5
+ Python mirror of ``@drakkar.software/starfish-events`` (``packages/ts/events``).
6
+
7
+ Public surface
8
+ --------------
9
+ - :func:`create_events_server_plugin` — factory that returns a
10
+ :class:`~starfish_protocol.plugins.ServerPlugin` whose ``intercept_push``
11
+ hook encodes event batches as Parquet.
12
+ - :func:`encode_parquet` — low-level encoder (exposed for testing and direct
13
+ use).
14
+ - :data:`COLUMNS` — the fixed 10-column tuple that forms the Parquet schema.
15
+ """
16
+
17
+ from starfish_events.encode import COLUMNS, encode_parquet
18
+
19
+
20
def __getattr__(name: str):
    """Resolve ``create_events_server_plugin`` lazily on attribute access.

    This is a module-level ``__getattr__`` (PEP 562).  The plugin module
    imports ``starfish_server``, so deferring its import until the factory is
    actually requested lets callers that only need the encoder or the column
    tuple avoid loading the server stack.

    :param name: Attribute being looked up on the ``starfish_events`` module.
    :returns: The ``create_events_server_plugin`` factory.
    :raises AttributeError: For any other attribute name.
    """
    if name != "create_events_server_plugin":
        raise AttributeError(f"module 'starfish_events' has no attribute {name!r}")

    # Imported here rather than at module top so the cost is paid on demand.
    from starfish_events.plugin import create_events_server_plugin as _factory

    return _factory
31
+
32
+
33
+ __all__ = ["COLUMNS", "encode_parquet", "create_events_server_plugin"]
@@ -0,0 +1,63 @@
1
+ """Parquet encoding for SunGlasses event rows.
2
+
3
+ Column schema matches the ``EventRow`` produced by ``apps/ingest-server/src/schema.ts``
4
+ so DuckDB queries are identical regardless of which backend delivered the data
5
+ (HTTP ingest server vs. this Starfish-events plugin).
6
+
7
+ All columns are STRING (VARCHAR in Parquet terms). ``UNCOMPRESSED`` codec
8
+ avoids a native/WASM compressor dependency and matches the TypeScript side
9
+ (``packages/ts/events/src/encode.ts``).
10
+
11
+ Privacy: the caller is responsible for never logging the contents of
12
+ ``distinct_id``, ``properties``, or ``context``. This module stores whatever
13
+ it receives opaquely.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import io
18
+
19
+ import pyarrow as pa
20
+ import pyarrow.parquet as pq
21
+
22
# Fixed column order — mirrors apps/ingest-server EventRow and
# packages/ts/events/src/encode.ts. The order is part of the schema contract
# consumed by DuckDB; do not reorder.
#
# Ten columns in total; ``encode_parquet`` writes every one of them as a
# Parquet string column, in exactly this order.
COLUMNS: tuple[str, ...] = (
    "event_type",
    "event",
    "distinct_id",  # privacy-sensitive: never log its contents
    "anonymous_id",
    "ts",
    "message_id",
    "properties",  # privacy-sensitive: never log its contents
    "context",  # privacy-sensitive: never log its contents
    "dt",
    "received_at",  # ingest timestamp stamped by the server plugin, not the client
)
37
+
38
+
39
def encode_parquet(rows: list[dict]) -> bytes:
    """Serialise flat event-row dicts into an uncompressed Parquet file.

    One string column is produced per entry of ``COLUMNS``, in that order.
    A cell whose key is missing, or whose value is ``None``, becomes ``""``;
    every other value is passed through :class:`str`, so numbers and other
    JSON types never cause a type error.

    NOTE(review): ``str()`` renders booleans as ``"True"``/``"False"`` and
    nested dicts/lists with Python repr rather than JSON — presumably the
    adapter pre-serialises such values; confirm against the TypeScript encoder.

    :param rows: Row dicts, one per event, as flattened by the adapter's
        ``toStarfishRow`` (or an equivalent mapper).  Keys outside ``COLUMNS``
        are ignored.
    :returns: Raw Parquet bytes starting (and ending) with the ``PAR1`` magic.
    :raises Exception: Propagated from pyarrow on encoding failure; the plugin
        wraps this as an HTTP 500 so the client retries.
    """
    string_type = pa.string()
    columns = {}
    for name in COLUMNS:
        cells = []
        for row in rows:
            value = row.get(name)
            cells.append("" if value is None else str(value))
        columns[name] = pa.array(cells, type=string_type)

    # compression="none" keeps the output uncompressed, as the module requires.
    sink = io.BytesIO()
    pq.write_table(pa.table(columns), sink, compression="none")
    return sink.getvalue()
@@ -0,0 +1,190 @@
1
+ """Starfish server plugin: intercepts JSON event-batch pushes and encodes them
2
+ as Parquet files written directly to the object store (typically S3).
3
+
4
+ How it works
5
+ ------------
6
+ 1. Register a JSON-typed collection (``allowed_mime_types: ["application/json"]``)
7
+ with public write access.
8
+ 2. Attach this plugin to the sync router.
9
+ 3. Each push to that collection is intercepted here; the JSON event batch is
10
+ encoded as Parquet and stored via ``store.put_bytes``, short-circuiting the
11
+ default JSON document write so no JSON is persisted alongside the Parquet.
12
+
13
+ Collection requirement
14
+ -----------------------
15
+ The intercepted collection **must** be JSON-typed — ``intercept_push`` only
16
+ receives a populated ``raw_body`` for JSON collections (see
17
+ ``route_builder.py:847``). A binary (Parquet-typed) collection would yield an
18
+ empty body.
19
+
20
+ One file per batch
21
+ ------------------
22
+ Parquet's column-footer format makes in-place append impractical. Each
23
+ ``send()`` call from the SunGlasses adapter writes a unique path (batchId in
24
+ the storage-path template). DuckDB's
25
+ ``read_parquet('s3://…/**/*.parquet')`` glob treats all files under the prefix
26
+ as one logical dataset.
27
+
28
+ Privacy
29
+ -------
30
+ Never log ``distinct_id``, ``properties``, or ``context``. Log counts only.
31
+ These values ride as opaque strings into Parquet.
32
+ """
33
+ from __future__ import annotations
34
+
35
+ import hashlib
36
+ import json
37
+ import logging
38
+ from datetime import datetime, timezone
39
+ from typing import TYPE_CHECKING
40
+
41
+ from starfish_protocol.constants import PARQUET_MIME_TYPE
42
+ from starfish_protocol.plugins import PushHookContext, PushHookResult, ServerPlugin
43
+ from starfish_server.router.route_builder import resolve_document_key
44
+
45
+ from starfish_events.encode import encode_parquet
46
+
47
+ if TYPE_CHECKING:
48
+ from starfish_server.storage.base import AbstractObjectStore
49
+
50
+ _log = logging.getLogger(__name__)
51
+
52
+
53
def _supports_binary(store: object) -> bool:
    """Return True when *store*'s class overrides ``put_bytes`` (binary writes).

    The base :class:`~starfish_server.storage.base.AbstractObjectStore` raises
    ``NotImplementedError`` on ``put_bytes``.  Any concrete implementation
    (``S3ObjectStore``, ``FilesystemObjectStore``, ``MemoryObjectStore``)
    overrides it and returns ``True`` here.

    An object whose class defines no ``put_bytes`` at all yields ``False``
    rather than raising ``AttributeError``, so the caller can report the
    problem with its own, clearer error.

    :param store: Candidate object store (any object is accepted).
    :returns: ``True`` if ``type(store).put_bytes`` exists and is not the base
        class's implementation, ``False`` otherwise.
    """
    # Imported lazily, mirroring the TYPE_CHECKING-only import at module top.
    from starfish_server.storage.base import AbstractObjectStore as _Base

    # Look the method up on the class, not the instance, so the identity
    # comparison is between plain functions rather than fresh bound methods.
    impl = getattr(type(store), "put_bytes", None)
    return impl is not None and impl is not _Base.put_bytes
64
+
65
+
66
_INVALID_BODY_ERROR = "Invalid JSON body — expected { data: { events: [...] }, baseHash }"
_INVALID_EVENT_ERROR = "Invalid event batch — every entry in data.events must be a JSON object"


def _parse_event_batch(raw_body: object) -> list[dict]:
    """Extract the ``data.events`` list from a raw push body.

    :param raw_body: The request body as handed to the push hook.
    :returns: The list of event objects.  A missing or non-list
        ``data.events`` is treated as an empty batch.
    :raises ValueError: When the body is not parseable JSON, or when any entry
        of ``data.events`` is not a JSON object.  The message is safe to
        return to the client (it never contains event contents).
    """
    try:
        envelope = json.loads(raw_body)
    except (TypeError, ValueError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError; TypeError
        # covers a None / non-text body.  ``from None`` drops the original
        # exception, which carries the full body text, to honour the privacy
        # rule of never retaining event contents in error objects.
        raise ValueError(_INVALID_BODY_ERROR) from None

    data = envelope.get("data") if isinstance(envelope, dict) else None
    raw = data.get("events") if isinstance(data, dict) else None
    if not isinstance(raw, list):
        return []
    # Each event is later merged with ``received_at`` via ``{**event, ...}``,
    # which only works for mappings — reject anything else up front.
    if not all(isinstance(event, dict) for event in raw):
        raise ValueError(_INVALID_EVENT_ERROR)
    return raw


def _utc_now_iso_millis() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS.mmmZ``."""
    now = datetime.now(timezone.utc)
    # Milliseconds are truncated (not rounded) from the microsecond field.
    return now.strftime("%Y-%m-%dT%H:%M:%S.") + f"{now.microsecond // 1000:03d}Z"


def create_events_server_plugin(
    *,
    store: "AbstractObjectStore",
    collection: str,
    storage_path: str,
) -> ServerPlugin:
    """Build a :class:`~starfish_protocol.plugins.ServerPlugin` that encodes
    SunGlasses event batches as Parquet and writes them to the object store.

    The returned plugin's ``intercept_push`` hook behaves as follows:

    * pushes to any other collection → ``proceed`` (untouched);
    * unparseable body, or a non-object entry in ``data.events`` → ``reject``
      with HTTP 400 (retrying cannot help);
    * Parquet encoding or storage failure → ``reject`` with HTTP 500 so the
      client requeues the batch;
    * success → ``respond`` with HTTP 200 and ``{"hash": <sha256 of the
      Parquet bytes>}``.

    A body whose ``data.events`` is missing or not a list is treated as an
    empty batch and still produces a zero-row Parquet file.

    :param store: Object store the plugin writes Parquet files to.  Must
        implement ``put_bytes`` (e.g. ``S3ObjectStore`` or ``MemoryObjectStore``).
        Pass the **same** store instance you pass to ``create_sync_router``.
    :param collection: Name of the collection to intercept.  Must match the
        ``name`` field in the ``SyncConfig.collections`` entry.
        Example: ``"events"``.
    :param storage_path: Storage-path template for the output Parquet key.
        Supports ``{param}`` placeholders resolved from the push URL's path
        params.  Example: ``"events/{app}/{batchId}"`` →
        ``"events/myapp/<uuid>"``.  The plugin appends ``.parquet`` when the
        resolved key does not already end with it.
    :returns: A ``ServerPlugin`` named ``"starfish-events"``.
    :raises TypeError: When *store* does not override ``put_bytes``.

    Example wiring::

        from starfish_server.storage.s3 import S3ObjectStore
        from starfish_events import create_events_server_plugin

        store = S3ObjectStore(...)
        plugin = create_events_server_plugin(
            store=store,
            collection="events",
            storage_path="events/{app}/{batchId}",
        )
        router = create_sync_router(
            SyncRouterOptions(
                store=store,
                config=SyncConfig(
                    version=1,
                    collections=[
                        CollectionConfig(
                            name="events",
                            storage_path="events/{app}/{batchId}",
                            read_roles=["admin"],
                            write_roles=["public"],
                            encryption="none",
                            allowed_mime_types=["application/json"],  # JSON-typed!
                            max_body_bytes=8_000_000,
                        )
                    ],
                ),
                plugins=[plugin],
            )
        )
    """
    if not _supports_binary(store):
        raise TypeError(
            "[starfish-events] the provided store does not implement put_bytes "
            "(binary writes). Use S3ObjectStore, FilesystemObjectStore, or "
            "MemoryObjectStore."
        )

    async def _intercept_push(ctx: PushHookContext) -> PushHookResult:
        # Only intercept the configured collection; let everything else proceed.
        if ctx.collection != collection:
            return PushHookResult(action="proceed")

        # Parse the push envelope: { data: { events: [...] }, baseHash }
        try:
            events = _parse_event_batch(ctx.raw_body)
        except ValueError as exc:
            return PushHookResult(action="reject", status=400, error=str(exc))

        # Stamp ingest time server-side. Never log event contents.
        received_at = _utc_now_iso_millis()
        rows = [{**event, "received_at": received_at} for event in events]

        # Encode to Parquet.
        try:
            parquet_bytes = encode_parquet(rows)
        except Exception:
            # Boundary handler: log with traceback, answer with a clean 500.
            _log.exception("[starfish-events] Parquet encoding failed")
            return PushHookResult(action="reject", status=500, error="Parquet encoding failed")

        # Resolve the output key from the storagePath template + URL params.
        key = resolve_document_key(storage_path, dict(ctx.params))
        if not key.endswith(".parquet"):
            key += ".parquet"

        # Write to the object store. On failure return a clean HTTP 500 so the
        # SunGlasses adapter sees a non-2xx and the SDK requeues the batch
        # (at-least-once delivery guarantee).
        try:
            await store.put_bytes(key, parquet_bytes, content_type=PARQUET_MIME_TYPE)
        except Exception:
            _log.exception("[starfish-events] put_bytes failed for key %r", key)
            return PushHookResult(action="reject", status=500, error="Storage write failed")

        # Compute SHA-256 to match the binary push response format.
        sha = hashlib.sha256(parquet_bytes).hexdigest()

        # Privacy: log only counts, never event contents.
        _log.info(
            "[starfish-events] wrote %d event(s) → %s (%d bytes)",
            len(events),
            key,
            len(parquet_bytes),
        )

        return PushHookResult(action="respond", status=200, body={"hash": sha})

    return ServerPlugin(name="starfish-events", intercept_push=_intercept_push)
188
+
189
+
190
+ __all__ = ["create_events_server_plugin"]
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: starfish-events
3
+ Version: 3.0.0a42
4
+ Summary: Starfish server plugin that intercepts JSON event-batch pushes and encodes them as Parquet on S3 (Python mirror of @drakkar.software/starfish-events)
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: starfish-protocol
7
+ Requires-Dist: starfish-server
8
+ Requires-Dist: pyarrow>=24.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=7.0; extra == "dev"
11
+ Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
12
+ Requires-Dist: httpx>=0.25.0; extra == "dev"
13
+ Requires-Dist: fastapi>=0.100; extra == "dev"
14
+ Requires-Dist: pyarrow>=24.0; extra == "dev"
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ starfish_events/__init__.py
4
+ starfish_events/encode.py
5
+ starfish_events/plugin.py
6
+ starfish_events.egg-info/PKG-INFO
7
+ starfish_events.egg-info/SOURCES.txt
8
+ starfish_events.egg-info/dependency_links.txt
9
+ starfish_events.egg-info/requires.txt
10
+ starfish_events.egg-info/top_level.txt
11
+ tests/test_cross_language.py
12
+ tests/test_encode.py
13
+ tests/test_integration.py
@@ -0,0 +1,10 @@
1
+ starfish-protocol
2
+ starfish-server
3
+ pyarrow>=24.0
4
+
5
+ [dev]
6
+ pytest>=7.0
7
+ pytest-asyncio>=0.21
8
+ httpx>=0.25.0
9
+ fastapi>=0.100
10
+ pyarrow>=24.0
@@ -0,0 +1 @@
1
+ starfish_events