PyPI - flagsmith-sql-flag-engine - Versions diffs - 0.1.0__py3-none-any.whl - Mend

flagsmith-sql-flag-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

flagsmith_sql_flag_engine/__init__.py +28 -0
flagsmith_sql_flag_engine/dialect.py +125 -0
flagsmith_sql_flag_engine/dialects/__init__.py +5 -0
flagsmith_sql_flag_engine/dialects/clickhouse.py +329 -0
flagsmith_sql_flag_engine/py.typed +0 -0
flagsmith_sql_flag_engine/translator.py +569 -0
flagsmith_sql_flag_engine/utils.py +87 -0
flagsmith_sql_flag_engine-0.1.0.dist-info/METADATA +153 -0
flagsmith_sql_flag_engine-0.1.0.dist-info/RECORD +10 -0
flagsmith_sql_flag_engine-0.1.0.dist-info/WHEEL +4 -0

flagsmith_sql_flag_engine/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""SQL translator for Flagsmith segment predicates.
+Public API:
+    translate_segment(segment, ctx) -> str | None
+    TranslateContext
+See README.md for usage. The translator is dialect-aware via the `Dialect`
+protocol; `flagsmith_sql_flag_engine.dialects.clickhouse.ClickHouseDialect`
+is the only implementation today.
+"""
+from flagsmith_sql_flag_engine.dialect import Dialect
+from flagsmith_sql_flag_engine.translator import (
+    TRANSLATABLE_OPERATORS,
+    TranslateContext,
+    translate_condition,
+    translate_rule,
+    translate_segment,
+)
+__all__ = [
+    "TRANSLATABLE_OPERATORS",
+    "Dialect",
+    "TranslateContext",
+    "translate_condition",
+    "translate_rule",
+    "translate_segment",
+]

flagsmith_sql_flag_engine/dialect.py ADDED Viewed

@@ -0,0 +1,125 @@
+"""Per-dialect SQL fragments — MD5 hex, hex-to-int parsing, prefix-anchored
+regex, padded-version comparison, type-aware trait predicates, regex flavour."""
+from typing import Protocol
+class Dialect(Protocol):
+    """Per-dialect SQL fragments.
+    Methods return SQL string fragments. Inputs are already-formatted SQL
+    strings (column refs, string literals); the dialect only chooses the
+    right syntax for the operation.
+    NOTE(review): the comments below refer to a `schema_ddl` attribute, but
+    it is not declared as a member of this protocol; `ClickHouseDialect`
+    defines it as a class attribute.
+    """
+    name: str  # human-readable, used in test ids and error messages
+    # --- IDENTITIES schema access ---
+    #
+    # The dialect owns the canonical IDENTITIES schema, see `schema_ddl`,
+    # so it also owns the SQL expression for each logical column. The
+    # translator just hands over an alias.
+    def identifier_expr(self, alias: str) -> str:
+        """SQL expression for `$.identity.identifier`, read from the
+        IDENTITIES row addressed by table alias `alias`."""
+        ...
+    def identity_key_expr(self, alias: str) -> str:
+        """SQL expression for `$.identity.key`, read from the IDENTITIES
+        row addressed by table alias `alias`."""
+        ...
+    def trait_path(self, alias: str, trait_key: str) -> str:
+        """Path-extract a trait value from the IDENTITIES traits container.
+        `trait_key` is the raw trait name; quoting / escaping it for the
+        target engine is the dialect's job. The path syntax varies by SQL
+        engine.
+        """
+        ...
+    def trait_eq(self, alias: str, trait_key: str, value: object, negate: bool) -> str:
+        """Type-aware EQUAL / NOT_EQUAL predicate on a trait, mirroring
+        `flag_engine`'s per-type coercion: the segment value is cast to
+        the trait's runtime type before compare, and a cast failure
+        means no match for both ops. `negate=False` emits EQUAL,
+        `negate=True` emits NOT_EQUAL. Implementation is dialect-specific
+        because trait-type discrimination and runtime type-coercion
+        casts both vary by engine.
+        """
+        ...
+    def trait_in(self, alias: str, trait_key: str, items: list[str]) -> str:
+        """Type-aware IN predicate on a trait, mirroring engine semantics:
+        string trait does direct lookup; integer trait stringifies and
+        looks up; other trait types never match. `items` is the parsed
+        candidate list per `flag_engine`'s `_get_in_values`.
+        """
+        ...
+    # --- string operations ---
+    def position(self, needle_lit: str, haystack_expr: str) -> str:
+        """Boolean: does the string literal `needle_lit` appear in
+        `haystack_expr`? Used for CONTAINS / NOT_CONTAINS."""
+        ...
+    def lpad(self, expr: str, width: int, pad_lit: str) -> str:
+        """Left-pad `expr` to `width` using `pad_lit` (an SQL string
+        literal, quotes included)."""
+        ...
+    def coalesce(self, *exprs: str) -> str:
+        """COALESCE/NVL-style: first non-null."""
+        ...
+    # --- regex ---
+    def regex_supports(self, pattern: str) -> bool:
+        """Return True if this dialect's regex engine can compile
+        `pattern`. The translator falls back to `None` for any REGEX
+        condition where this returns False, letting the caller defer
+        to `flag_engine`."""
+        ...
+    def regexp_anchored_match(self, value_expr: str, pattern: str) -> str:
+        """Boolean: equivalent to Python `re.match(pattern, value)` —
+        anchored at position 0, may be a prefix of the value, not a
+        full-match.
+        `pattern` is the raw Python regex string; the dialect handles
+        its own escaping into a SQL literal, since regex flavours
+        differ in how backslashes are treated."""
+        ...
+    def regexp_nth_digit_run(self, value_expr: str, n: int) -> str:
+        """Extract the n-th (1-indexed) sequence of digits from
+        `value_expr`. Returns NULL if there are fewer than n digit runs.
+        Used for semver."""
+        ...
+    # --- hashing primitives for PERCENTAGE_SPLIT ---
+    def md5_hex(self, expr: str) -> str:
+        """SQL fragment producing the MD5 digest of `expr` — nominally the
+        lowercase 32-char hex digest.
+        The translator only ever passes the result to this same dialect's
+        `parse_hex_chunk`, so a dialect may return a different digest
+        representation as long as its own `parse_hex_chunk` understands
+        it; `ClickHouseDialect` returns the raw 16-byte digest."""
+        ...
+    def parse_hex_chunk(self, hex_expr: str, start: int, length: int = 8) -> str:
+        """Parse `length` hex characters of `hex_expr` starting at 1-indexed
+        `start` into a non-negative integer.
+        `hex_expr` is whatever this dialect's `md5_hex` returned; `start`
+        and `length` are always expressed in hex-character positions."""
+        ...
+    # --- type casts ---
+    def cast_string(self, expr: str) -> str:
+        """Cast `expr` to STRING / VARCHAR."""
+        ...
+    def cast_float(self, expr: str) -> str:
+        """Cast `expr` to a 64-bit float / DOUBLE."""
+        ...
+    def cast_number(self, expr: str) -> str:
+        """Cast `expr` to a NUMBER / BIGINT — the engine-side numeric
+        type used for modulo arithmetic."""
+        ...
+    # --- composition ---
+    def mod(self, dividend: str, divisor: str) -> str:
+        """`dividend MOD divisor` returning a numeric value. Both operands
+        are SQL expression strings."""
+        ...

flagsmith_sql_flag_engine/dialects/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Dialect implementations."""
+from flagsmith_sql_flag_engine.dialects.clickhouse import ClickHouseDialect
+__all__ = ["ClickHouseDialect"]

flagsmith_sql_flag_engine/dialects/clickhouse.py ADDED Viewed

@@ -0,0 +1,329 @@
+"""ClickHouse dialect: SQL fragments tailored to ClickHouse's function set.
+## Expected schema
+The translator emits predicates against a single `IDENTITIES` table —
+four typed columns `environment_id`, `id`, `identifier`, `identity_key`,
+plus one `JSON` column `traits` holding the identity's full trait map
+in ClickHouse's native columnar JSON layout. Trait keys are JSON paths
+on the column, not schema columns.
+The `JSON` type was chosen over `Nullable(String)` + `JSONExtract*`
+because:
+  - It stores each path as a typed subcolumn, so trait reads are a
+    direct columnar scan — no per-row JSON parse. Empirically: at 870M
+    rows on a Cloud trial, simple/multi predicates dropped from 14-20×
+    slower than Snowflake VARIANT to within 2.5-4×. The wide-String
+    variant scales linearly with row count where Snowflake / `JSON`
+    stay near-flat.
+  - Schema evolution is implicit: new trait keys appear as new
+    subcolumns at INSERT time, no DDL change.
+  - It matches Snowflake `VARIANT`'s semantic model — same NULL-on-miss
+    behaviour, same type discrimination, same path syntax cost shape.
+The trade-off is that ClickHouse caps `max_dynamic_paths` per JSON
+column (default 1024). Above that, additional paths spill into a
+`Dynamic` catch-all and lose the columnar fast path. This is fine for
+typical Flagsmith trait vocabularies; we should monitor.
+## Notable choices
+  - Subcolumn access uses backtick-quoted identifiers: ``i.traits.`key` ``.
+    Backticks are doubled to escape; arbitrary trait keys including
+    spaces and dots are supported. CH's `getSubcolumn(json, 'key')`
+    function works but doesn't compose with the typed-variant `.:Type`
+    accessor, so we standardise on backtick form everywhere.
+  - `trait_path` returns the trait's canonical string form via
+    `toString(<sub>)`, with a leading `IS NULL` guard so missing keys
+    and JSON null surface as SQL NULL. Mirrors Snowflake's `::STRING`
+    semantics — downstream regex / position / compare paths get
+    unquoted strings, decimal digits for numerics, and `'true'` /
+    `'false'` for bools.
+  - `trait_eq` (positive) leads with a `toString(<sub>) = <lit>` fast
+    path — covers String + canonical-stringified Int / UInt / Float +
+    lowercase Bool in one subcolumn read. A typed-variant Bool branch
+    (``<sub>.:Bool = <target>``) picks up Python-bool-repr "True" /
+    "False" coercions, and a `toFloat64OrNull(toString(<sub>))` branch
+    catches floats whose canonical toString integer-trims (1.0 → '1').
+    Mirrors Snowflake's `v::STRING` fast path. `NOT_EQUAL` still does
+    explicit per-type dispatch via typed-variant subcolumns
+    (``.:String``, ``.:Int64``, ``.:UInt64``, ``.:Float64``, ``.:Bool``);
+    each accessor is NULL when the JSON value is the wrong type, which
+    matches the engine's "cast failed → False" semantics.
+  - Anchored regex uses `match(value, '^(...)')` — ClickHouse's `match`
+    is RE2 and unanchored, so we prepend `^` to mirror Python's
+    `re.match` (start-anchored, prefix-allowed, not full-match).
+  - n-th digit run uses `extractAll(value, '\\d+')[n]`; ClickHouse's
+    array subscript is 1-indexed and returns `''` for out-of-bounds, so
+    we `nullIf(..., '')` to keep the engine's "no n-th run" → NULL
+    contract.
+  - Hex-chunk parsing reads directly from the raw 16-byte MD5 output
+    rather than round-tripping through hex. `MD5(expr)` returns a
+    `FixedString(16)`; `reinterpretAsUInt32(reverse(substring(...)))`
+    pulls a big-endian UInt32 out of any 4-byte slice. Skipping the
+    `hex(MD5(...))` → `unhex(substring(...))` round-trip is a small but
+    consistent speedup on `% Split`-heavy predicates.
+## Setup
+`JSON` type DDL requires `SET allow_experimental_json_type = 1` on
+ClickHouse Cloud as of 25.12 (no longer experimental on OSS 25.x).
+Callers should apply this setting at session creation."""
+from flagsmith_sql_flag_engine.utils import re2_safe, string_literal
+# DDL for the single `IDENTITIES` table this dialect's predicates target.
+# Exposed on the dialect as `ClickHouseDialect.schema_ddl`. The text is
+# emitted verbatim, so the `--` lines inside it are SQL comments that ship
+# with the statement.
+SCHEMA_DDL = """\
+CREATE TABLE IF NOT EXISTS IDENTITIES (
+    -- environment.key from EnvironmentContext; used as the env partition
+    environment_id String,
+    -- stable per-identity row id
+    id UInt64,
+    -- the identity's external identifier, exposed as $.identity.identifier
+    identifier String,
+    -- the composite identity key, exposed as $.identity.key
+    identity_key String,
+    -- the identity's full trait map. ClickHouse's `JSON` type stores each
+    -- path as a typed subcolumn so trait lookups are columnar reads, not
+    -- per-row JSON parses. SQL NULL for an identity with no traits.
+    traits JSON
+)
+ENGINE = MergeTree()
+ORDER BY (environment_id, id);
+"""
+def _backtick(trait_key: str) -> str:
+    """Escape a trait key for use as a backtick-quoted JSON subcolumn name.
+    Backslashes are doubled first, then embedded backticks are doubled.
+    ClickHouse processes backslash escapes inside backtick-quoted
+    identifiers the same way it does inside string literals, so an
+    unescaped `\\` in a trait key would either be read as an escape
+    sequence (addressing the wrong path) or, as the last character,
+    swallow the closing backtick and corrupt the surrounding SQL.
+    Returns the quoted identifier, backticks included."""
+    # Order matters: escape backslashes before adding any new characters.
+    escaped = trait_key.replace("\\", "\\\\").replace("`", "``")
+    return "`" + escaped + "`"
+def _non_null(expr: str) -> str:
+    """Wrap `expr` so a possibly-`Nullable(String)` value becomes a plain
+    `String`, substituting `''` for NULL.
+    Needed because ClickHouse refuses the regex functions (`match`,
+    `extractAll`) on `Nullable(String)`: the result types it would infer,
+    `Nullable(UInt8)` / `Nullable(Array(String))`, aren't representable.
+    The translator always guards these calls with `IS NOT NULL`, so the
+    empty-string default is never actually observed at runtime."""
+    return "ifNull(" + expr + ", '')"
+class ClickHouseDialect:
+    """`Dialect` implementation emitting ClickHouse SQL fragments.
+    Every method returns an SQL string fragment. Trait access goes through
+    backtick-quoted subcolumns of the `traits` JSON column of the
+    IDENTITIES table described by `schema_ddl`.
+    """
+    name = "clickhouse"
+    schema_ddl = SCHEMA_DDL
+    # ----- IDENTITIES schema access -----
+    def identifier_expr(self, alias: str) -> str:
+        """Column reference for `$.identity.identifier` under `alias`."""
+        return f"{alias}.identifier"
+    def identity_key_expr(self, alias: str) -> str:
+        """Column reference for `$.identity.key` under `alias`."""
+        return f"{alias}.identity_key"
+    def _sub(self, alias: str, trait_key: str) -> str:
+        """The raw JSON subcolumn reference for a trait key.
+        ``alias.traits.`key` `` — Dynamic-typed, NULL for missing keys
+        and explicit JSON null."""
+        return f"{alias}.traits.{_backtick(trait_key)}"
+    def trait_path(self, alias: str, trait_key: str) -> str:
+        """Trait value in canonical string form, NULL when absent."""
+        # Return the trait's canonical string form, mirroring Snowflake's
+        # `i.traits:"key"::STRING`:
+        #
+        #   - missing key       → NULL
+        #   - JSON null value   → NULL
+        #   - JSON string "x"   → 'x' (quotes stripped)
+        #   - JSON int / float  → '42' / '3.14'
+        #   - JSON true / false → 'true' / 'false'
+        #
+        # `toString` over a JSON subcolumn does the right canonicalisation
+        # natively. The `IS NULL` guard distinguishes missing from a
+        # JSON empty string (`""` round-trips as `''` through toString,
+        # the same value `toString(NULL)` produces) — the translator's
+        # `IS NULL` / `IS NOT NULL` checks rely on this distinction.
+        sub = self._sub(alias, trait_key)
+        return f"if({sub} IS NULL, NULL, toString({sub}))"
+    def trait_eq(self, alias: str, trait_key: str, value: object, negate: bool) -> str:
+        """Type-aware EQUAL (`negate=False`) / NOT_EQUAL (`negate=True`)
+        predicate comparing trait `trait_key` against segment `value`.
+        `value` is stringified with `str()` and then re-cast per trait
+        type the way the engine does; a failed int / float cast simply
+        drops that type's branch from the predicate."""
+        sub = self._sub(alias, trait_key)
+        str_value = str(value)
+        str_lit = string_literal(str_value)
+        # Engine bool cast: `v not in ("False", "false")`. A JSON true matches
+        # every segment value except literal "False" / "false"; those two coerce
+        # to False and match a JSON false.
+        bool_target = "true" if str_value not in ("False", "false") else "false"
+        # Engine int / float cast: ValueError → no match for that branch.
+        try:
+            int_lit: str | None = str(int(str_value))
+        except (ValueError, TypeError):
+            int_lit = None
+        try:
+            float_lit: str | None = repr(float(str_value))
+        except (ValueError, TypeError):
+            float_lit = None
+        # `toString(<sub>)` returns the JSON value's canonical string form
+        # in a single subcolumn read — 'x' for String, '42' for Int / UInt,
+        # '3.14' for Float, 'true' / 'false' for Bool. Mirrors Snowflake's
+        # `v::STRING` and lets us collapse the typical match path to one
+        # comparison instead of an OR across five typed-variant subcolumns.
+        str_path = f"toString({sub})"
+        bool_sub = f"{sub}.:Bool"
+        if not negate:
+            # Fast path: covers String + canonical-stringified Int / UInt /
+            # Float + lowercase Bool ('true' / 'false') in one branch.
+            clauses = [f"({str_path} = {str_lit})"]
+            # Bool branch: engine treats any segment value except "False" /
+            # "false" as bool True, so a JSON true trait must match e.g.
+            # `EQUAL("flag", "growth")`. The fast path catches the
+            # lowercase case; this branch picks up Python-bool-repr "True"
+            # / "False" and any other coercion that doesn't string-match
+            # 'true' / 'false' directly.
+            clauses.append(f"({bool_sub} = {bool_target})")
+            # Float branch: floats whose `toString` integer-trims (1.0 →
+            # '1') miss the fast path against a `'1.0'` segment value.
+            # `toFloat64OrNull(str_path)` covers Int / UInt / Float
+            # uniformly; non-numeric traits stringify to something
+            # `toFloat64OrNull` rejects → NULL → no match.
+            if float_lit is not None and float_lit != str_value:
+                clauses.append(f"(toFloat64OrNull({str_path}) = {float_lit})")
+            return "(" + " OR ".join(clauses) + ")"
+        # NOT_EQUAL: per-type dispatch. Engine returns True only when the
+        # cast succeeded *and* values differ. `.:Type IS NOT NULL AND .:Type
+        # <> lit` encodes that directly; types where the segment value can't
+        # cast contribute FALSE.
+        no_match = "FALSE"
+        str_sub = f"{sub}.:String"
+        int_sub = f"{sub}.:Int64"
+        uint_sub = f"{sub}.:UInt64"
+        float_sub = f"{sub}.:Float64"
+        bool_branch = f"({bool_sub} IS NOT NULL AND {bool_sub} <> {bool_target})"
+        if int_lit is not None or float_lit is not None:
+            num_lit = int_lit if int_lit is not None else float_lit
+            num_branch = (
+                f"(({int_sub} IS NOT NULL AND {int_sub} <> {num_lit})"
+                f" OR ({uint_sub} IS NOT NULL AND {uint_sub} <> {num_lit})"
+                f" OR ({float_sub} IS NOT NULL AND {float_sub} <> {num_lit}))"
+            )
+        else:
+            num_branch = no_match
+        return (
+            f"(({str_sub} IS NOT NULL AND {str_sub} <> {str_lit}) OR {bool_branch} OR {num_branch})"
+        )
+    def trait_in(self, alias: str, trait_key: str, items: list[str]) -> str:
+        """Type-aware IN predicate: does trait `trait_key` equal one of
+        the candidate strings in `items`?
+        Returns the constant `FALSE` when `items` is empty — nothing is a
+        member of an empty list, and this avoids rendering an empty
+        `IN ()` list.
+        """
+        if not items:
+            # Membership in an empty candidate list is always False; an
+            # empty `IN ()` is not reliably accepted SQL, so short-circuit.
+            return "FALSE"
+        # `toString(<sub>)` returns the canonical string form for any JSON
+        # value type in a single subcolumn read. Engine semantics only
+        # match String and integer trait types — bool / float / array
+        # traits never match — so we gate the toString-based IN compare on
+        # `.:Bool IS NULL AND .:Float64 IS NULL`. Int / UInt traits pass
+        # because their stringified form ('42') matches the item literals;
+        # missing keys propagate NULL through toString and fail the IN.
+        sub = self._sub(alias, trait_key)
+        bool_sub = f"{sub}.:Bool"
+        float_sub = f"{sub}.:Float64"
+        str_path = f"toString({sub})"
+        item_lits = ",".join(string_literal(v) for v in items)
+        return f"({bool_sub} IS NULL AND {float_sub} IS NULL AND {str_path} IN ({item_lits}))"
+    # ----- string operations -----
+    def position(self, needle_lit: str, haystack_expr: str) -> str:
+        """Boolean fragment: `needle_lit` occurs somewhere in `haystack_expr`."""
+        # ClickHouse's argument order is (haystack, needle), opposite of
+        # Snowflake's POSITION(needle, haystack). Returns 1-indexed
+        # position, 0 for not-found.
+        return f"position({haystack_expr}, {needle_lit}) > 0"
+    def lpad(self, expr: str, width: int, pad_lit: str) -> str:
+        """Left-pad `expr` to `width` characters with SQL literal `pad_lit`."""
+        return f"leftPad({expr}, {width}, {pad_lit})"
+    def coalesce(self, *exprs: str) -> str:
+        """First non-NULL of `exprs`."""
+        return f"coalesce({', '.join(exprs)})"
+    # ----- regex -----
+    def regex_supports(self, pattern: str) -> bool:
+        """Whether `pattern` passes the `re2_safe` check."""
+        # ClickHouse's regex engine is RE2 (`match`, `extractAll`).
+        return re2_safe(pattern)
+    @staticmethod
+    def _regex_literal(pattern: str) -> str:
+        """Render a raw regex `pattern` as a ClickHouse string literal."""
+        # ClickHouse string literals process `\` as an escape, so a SQL
+        # `'\d'` reaches the regex engine as `d`. Double the backslashes so
+        # the engine sees `\d`; SQL single quotes are escaped by doubling
+        # per the SQL standard.
+        doubled = pattern.replace("\\", "\\\\").replace("'", "''")
+        return f"'{doubled}'"
+    def regexp_anchored_match(self, value_expr: str, pattern: str) -> str:
+        """Boolean fragment with Python `re.match(pattern, value)` semantics."""
+        # `match` is RE2 but unanchored — equivalent to `re.search`. Prepend
+        # `^` to get `re.match` semantics (start-anchored, prefix-allowed).
+        # Wrapping in `(...)` keeps the user's top-level alternation from
+        # binding tighter than the anchor.
+        anchored = "^(" + pattern + ")"
+        return f"match({_non_null(value_expr)}, {self._regex_literal(anchored)})"
+    def regexp_nth_digit_run(self, value_expr: str, n: int) -> str:
+        """The `n`-th (1-indexed) run of digits in `value_expr`, or NULL."""
+        # `extractAll` returns the matches array; subscript is 1-indexed
+        # and yields `''` past the end. `nullIf` collapses that to NULL so
+        # `COALESCE` upstream can fall back to `'0'`. `ifNull` coerces a
+        # `Nullable(String)` input down to `String` — ClickHouse refuses
+        # `extractAll` on `Nullable(String)` because the inferred result
+        # type `Nullable(Array(String))` is unrepresentable.
+        digit_run = self._regex_literal("\\d+")
+        return f"nullIf(extractAll({_non_null(value_expr)}, {digit_run})[{n}], '')"
+    # ----- hashing -----
+    def md5_hex(self, expr: str) -> str:
+        """MD5 digest of `expr`. Despite the name this is the raw digest,
+        not hex text; it is meant to be consumed by `parse_hex_chunk`."""
+        # Return the raw 16-byte MD5 digest rather than the hex string.
+        # `parse_hex_chunk` below reads bytes directly via
+        # `reinterpretAsUInt32(reverse(substring(...)))`, skipping the
+        # `hex` → `unhex` round-trip — small but consistent win on
+        # PERCENTAGE_SPLIT-heavy predicates.
+        return f"MD5({expr})"
+    def parse_hex_chunk(self, hex_expr: str, start: int, length: int = 8) -> str:
+        """Unsigned integer for the `length` hex digits beginning at
+        1-indexed hex position `start` of the digest `hex_expr`.
+        `start` and `length` are in hex characters and are converted to
+        byte offsets by integer division, so they are expected to fall on
+        byte boundaries (odd `start`, even `length`)."""
+        # `hex_expr` is the raw `FixedString(16)` from `md5_hex` (not a hex
+        # string). Map the 1-indexed hex start position to a 1-indexed byte
+        # position: hex 1 → byte 1, hex 9 → byte 5, hex 17 → byte 9,
+        # hex 25 → byte 13. 8 hex chars = 4 raw bytes.
+        byte_start = (start - 1) // 2 + 1
+        byte_length = length // 2
+        slice_expr = f"substring({hex_expr}, {byte_start}, {byte_length})"
+        # `reinterpretAsUInt32` reads bytes little-endian; `reverse` first
+        # so the value equals `int(hex_chars, 16)` for the corresponding
+        # hex slice — preserves `_HASH_CONST_*` constants from the translator.
+        return f"reinterpretAsUInt32(reverse({slice_expr}))"
+    # ----- casts -----
+    def cast_string(self, expr: str) -> str:
+        """Cast `expr` to a string via `toString`."""
+        return f"toString({expr})"
+    def cast_float(self, expr: str) -> str:
+        """Cast `expr` to Float64, yielding NULL when it doesn't parse."""
+        # `toFloat64OrNull` over the string form sidesteps `toFloat64`'s
+        # exception on a non-numeric input — engine behaviour on a cast
+        # failure is "doesn't match", which NULL propagation through the
+        # surrounding predicate gives us.
+        return f"toFloat64OrNull(toString({expr}))"
+    def cast_number(self, expr: str) -> str:
+        """Cast `expr` to Int64, yielding NULL when it doesn't parse."""
+        return f"toInt64OrNull(toString({expr}))"
+    # ----- composition -----
+    def mod(self, dividend: str, divisor: str) -> str:
+        """`dividend` modulo `divisor` via ClickHouse's `modulo` function."""
+        return f"modulo({dividend}, {divisor})"

flagsmith_sql_flag_engine/py.typed ADDED Viewed

File without changes

flagsmith_sql_flag_engine/translator.py ADDED Viewed

@@ -0,0 +1,569 @@
+"""Translate `SegmentContext` predicate trees into SQL `WHERE` expressions.
+Output drops into:
+    SELECT ... FROM IDENTITIES i
+    WHERE i.environment_id = '<env-key>' AND <translator output>
+Returns `None` if any condition uses an operator the active dialect
+can't translate — callers fall back to `flag_engine.is_context_in_segment`.
+"""
+import json
+from typing import Literal, NamedTuple
+import jsonpath_rfc9535
+from flag_engine.context.types import (
+    EvaluationContext,
+    SegmentCondition,
+    SegmentContext,
+    SegmentRule,
+)
+from flag_engine.segments.evaluator import is_context_in_segment
+from flag_engine.segments.types import ConditionOperator
+from flagsmith_sql_flag_engine.dialect import Dialect
+from flagsmith_sql_flag_engine.utils import (
+    escape_string,
+    modulo_literal,
+    numeric_literal,
+    string_literal,
+)
+# Condition operators exported as part of the package's public API.
+# NOTE(review): presumably the full set of operators the translator can
+# emit SQL for, with anything else making it return `None` — confirm
+# against `translate_condition`.
+TRANSLATABLE_OPERATORS: frozenset[ConditionOperator] = frozenset(
+    {
+        "EQUAL",
+        "NOT_EQUAL",
+        "IN",
+        "IS_SET",
+        "IS_NOT_SET",
+        "CONTAINS",
+        "NOT_CONTAINS",
+        "GREATER_THAN",
+        "LESS_THAN",
+        "GREATER_THAN_INCLUSIVE",
+        "LESS_THAN_INCLUSIVE",
+        "MODULO",
+        "PERCENTAGE_SPLIT",
+        "REGEX",
+    }
+)
+# Constants for chunked MD5-mod-9999 hash. The engine computes
+# `int(md5_hex, 16) % 9999`; we split the 32-hex digest into four 8-hex
+# chunks, parse each as a 32-bit int, and combine via modular arithmetic.
+# Constants are (16^24, 16^16, 16^8) mod 9999, precomputed. They weight
+# the first, second and third chunk respectively; the fourth chunk has
+# weight 1.
+_HASH_CONST_HIGH = 7291  # 16^24 mod 9999
+_HASH_CONST_MID = 1897  # 16^16 mod 9999
+_HASH_CONST_LOW = 6835  # 16^8 mod 9999
+# ---------------------------------------------------------------------------
+# Context: shape information the translator needs to produce correct refs.
+# ---------------------------------------------------------------------------
+class TranslateContext:
+    """Everything the translator needs to know about the query it is
+    producing predicates for.
+    - `evaluation_context`: a flag_engine `EvaluationContext`. Its
+      `identity` field is ignored, because identity values come from each
+      `IDENTITIES` row when the SQL runs.
+    - `dialect`: a `Dialect` implementation. It owns the IDENTITIES
+      schema, so column references are obtained from it rather than
+      configured here.
+    - `identities_alias`: table alias of `IDENTITIES` in the surrounding
+      query; `i` unless overridden.
+    - `segment_key`: salt for `PERCENTAGE_SPLIT`; `translate_segment`
+      injects it automatically from the segment's `key` field.
+    """
+    def __init__(
+        self,
+        evaluation_context: EvaluationContext,
+        dialect: Dialect,
+        identities_alias: str = "i",
+        segment_key: str | None = None,
+    ) -> None:
+        self.evaluation_context = evaluation_context
+        self.dialect = dialect
+        self.identities_alias = identities_alias
+        self.segment_key = segment_key
+    @property
+    def identity_key_expr(self) -> str:
+        """SQL expression for the identity key of the current row."""
+        alias = self.identities_alias
+        return self.dialect.identity_key_expr(alias)
+    def trait_path(self, trait_key: str) -> str:
+        """SQL expression extracting trait `trait_key`, as produced by the
+        dialect for the configured alias."""
+        alias = self.identities_alias
+        return self.dialect.trait_path(alias, trait_key)
+    def jsonpath_expr(self, prop: Literal["$.identity.identifier", "$.identity.key"]) -> str:
+        """SQL expression for one of the two row-bound identity JSONPaths."""
+        # Only the row-bound identity columns are mapped to SQL here; all
+        # other JSONPath properties are resolved against the eval context
+        # in `translate_condition` via `_engine_static_verdict`.
+        if prop == "$.identity.identifier":
+            return self.dialect.identifier_expr(self.identities_alias)
+        if prop == "$.identity.key":
+            return self.dialect.identity_key_expr(self.identities_alias)
+    def with_segment_key(self, key: str) -> "TranslateContext":
+        """Return a new context identical to this one except that
+        `segment_key` is `key`; this instance is left untouched."""
+        return TranslateContext(
+            self.evaluation_context,
+            self.dialect,
+            self.identities_alias,
+            key,
+        )
+# ---------------------------------------------------------------------------
+# Inline SQL builders for hash-based and version-based predicates.
+# ---------------------------------------------------------------------------
+def _percentage_split_expr(
+    ctx: TranslateContext, seg_key: str, ctx_value_sql: str, threshold: float
+) -> str:
+    """Build the boolean SQL test `hash(seg_key + "," + value) <= threshold`.
+    Follows `flag_engine.utils.hashing.get_hashed_percentage_for_object_ids`:
+    the digest is read as four 8-hex-char chunks which are recombined
+    modulo 9999. One known divergence: on the ~1/9999 inputs whose bare
+    hash mod 9999 equals 9998 the engine recurses with doubled input,
+    whereas this expression does not.
+    `ctx_value_sql` is an SQL expression (it is parenthesised before being
+    concatenated); `seg_key` is a plain string and is quoted here.
+    """
+    dialect = ctx.dialect
+    digest = dialect.md5_hex(f"{string_literal(seg_key)} || ',' || ({ctx_value_sql})")
+    # Hex positions 1, 9, 17 and 25 are the starts of the four chunks.
+    first, second, third, fourth = (
+        dialect.parse_hex_chunk(digest, start) for start in (1, 9, 17, 25)
+    )
+    weighted_sum = " + ".join(
+        (
+            f"{first} * {_HASH_CONST_HIGH}",
+            f"{second} * {_HASH_CONST_MID}",
+            f"{third} * {_HASH_CONST_LOW}",
+            f"{fourth}",
+        )
+    )
+    bucket = dialect.mod(weighted_sum, "9999")
+    return f"({bucket} / 9998.0 * 100.0 <= {float(threshold)})"
+def _semver_sort_key_expr(ctx: TranslateContext, value_sql: str) -> str:
+    """SQL expression for a string-sortable `major.minor.patch` key.
+    Each component is the corresponding digit run of `value_sql`,
+    defaulting to `'0'` when absent and left-padded with zeros to ten
+    characters, so comparing two keys as strings reproduces the engine's
+    GT/GTE/LT/LTE/EQ/NE outcome for the major.minor.patch portion. Any
+    prerelease suffix is ignored."""
+    dialect = ctx.dialect
+
+    def padded_component(n: int) -> str:
+        # n-th digit run, '0' if missing, zero-padded to width 10.
+        digit_run = dialect.regexp_nth_digit_run(value_sql, n)
+        return dialect.lpad(dialect.coalesce(digit_run, "'0'"), 10, "'0'")
+
+    major = padded_component(1)
+    minor = padded_component(2)
+    patch = padded_component(3)
+    return f"({major} || '.' || {minor} || '.' || {patch})"
+# ---------------------------------------------------------------------------
+# Trait-bound and direct comparisons. Both go against the IDENTITIES alias
+# (`ctx.identities_alias`, `i` by default) directly: trait conditions read
+# the dialect's `trait_path(alias, key)` expression, JSONPath conditions
+# read the appropriate identity column or env literal.
+# ---------------------------------------------------------------------------
+# The closed set of shapes a JSONPath property can classify as.
+JsonpathKind = Literal[
+    "identifier", "key", "trait", "identity_object", "untranslatable", "static"
+]
+class JsonpathClassification(NamedTuple):
+    """Result of classifying a JSONPath property for SQL translation.
+    `kind` says which shape the property has; `trait_key` holds the trait
+    name and is only populated when `kind == "trait"`.
+    """
+    kind: JsonpathKind
+    trait_key: str | None = None
+def _classify_jsonpath(prop: str) -> JsonpathClassification:
+    """Work out what a JSONPath property refers to in the SQL setting.
+    In our query model every `IDENTITIES` row IS an identity, whereas the
+    engine resolves `$.identity.*` against the eval-context identity. So
+    most identity-bound paths have to become row references instead of
+    being pre-computed statically from the eval context.
+    If `prop` does not parse as JSONPath it is classified as a trait whose
+    key is the `prop` string itself: the engine treats unparseable
+    `$.`-prefixed properties as literal trait keys, and we mirror that.
+    """
+    try:
+        compiled = jsonpath_rfc9535.compile(prop)
+    except jsonpath_rfc9535.JSONPathSyntaxError:
+        return JsonpathClassification("trait", prop)
+    # Collect the leading run of plain name selectors; stop at the first
+    # segment that is anything else and remember that we stopped early.
+    names: list[str] = []
+    only_names = True
+    for segment in compiled.segments:
+        if len(segment.selectors) != 1:  # pragma: no cover - multi-selector segments not in dataset
+            only_names = False
+            break
+        selector_name = getattr(segment.selectors[0], "name", None)
+        if selector_name is None:
+            only_names = False
+            break
+        names.append(selector_name)
+    if not names or names[0] != "identity":
+        # Not rooted at the identity: nothing row-bound is involved.
+        return JsonpathClassification("static")
+    if not only_names:
+        # Identity path with non-name selectors — wildcards, filters,
+        # etc. — which can't be mapped to fixed row references.
+        return JsonpathClassification("untranslatable")
+    match names:
+        case [_]:
+            # `$.identity` — the whole identity object. Every row in the
+            # IDENTITIES table IS an identity by construction, so the eval
+            # context (which may or may not carry an identity, depending
+            # on caller) is not consulted. The translator encodes the
+            # row-truth directly: IS_SET → TRUE, IS_NOT_SET → FALSE,
+            # scalar comparators → FALSE, mirroring the engine's
+            # fail-cast on a dict.
+            return JsonpathClassification("identity_object")
+        case [_, "identifier"]:
+            return JsonpathClassification("identifier")
+        case [_, "key"]:
+            return JsonpathClassification("key")
+        case [_, "traits", trait_name]:
+            return JsonpathClassification("trait", trait_name)
+        case _:
+            return JsonpathClassification("untranslatable")
+def _engine_static_verdict(ctx: TranslateContext, cond: SegmentCondition) -> str:
+    """Evaluate one condition with `is_context_in_segment` against the
+    eval context and return the SQL constant `'TRUE'` or `'FALSE'`.
+    Intended for JSONPath conditions that don't touch row-bound state:
+    their outcome is identical for every row of the resulting query, so
+    it is decided once, here, rather than in SQL."""
+    # Wrap the lone condition in a throwaway single-rule segment so the
+    # engine's segment evaluator can be reused as-is.
+    only_rule = {"type": "ALL", "conditions": [cond]}
+    throwaway_segment: SegmentContext = {
+        "key": ctx.segment_key or "_static",
+        "name": "_static",
+        "rules": [only_rule],
+    }
+    if is_context_in_segment(ctx.evaluation_context, throwaway_segment):
+        return "TRUE"
+    return "FALSE"
+def _engine_in_values(value: object) -> list[str] | None:
+    """Mirror `flag_engine.segments.evaluator._get_in_values`: parse a segment
+    value into a list of candidate strings. Returns None for inputs the
+    engine doesn't accept — anything that's neither a string nor a list."""
+    if isinstance(value, list):
+        return [v if isinstance(v, str) else str(v) for v in value]
+    if not isinstance(value, str):
+        return None
+    if value.startswith("["):
+        try:
+            parsed = json.loads(value)
+        except (ValueError, TypeError):
+            return value.split(",")
+        if isinstance(parsed, list):  # pragma: no branch - `[`-prefixed valid JSON parses as a list
+            return [v if isinstance(v, str) else str(v) for v in parsed]
+    return value.split(",")
+def _comparison(
+    ctx: TranslateContext,
+    op: str,
+    expr: str,
+    value: object,
+    is_jsonpath: bool = False,
+) -> str | None:
+    """Emit a SQL fragment comparing `expr` against `value` per `op`.
+    Used for both trait references — cast via the dialect as needed —
+    and JSONPath references, which arrive as already-typed columns or
+    string literals.
+    Returns `None` only for genuinely untranslatable inputs such as a
+    REGEX pattern the active dialect's regex flavour can't compile.
+    Inputs the engine evaluates to a deterministic False — missing
+    value, non-numeric operand on a comparator — compile to `"FALSE"`.
+    """
+    if value is None:
+        return "FALSE"
+    d = ctx.dialect
+    lit = string_literal(str(value))
+    str_expr = expr if is_jsonpath else d.cast_string(expr)
+    if op == "EQUAL":
+        return f"{str_expr} = {lit}"
+    if op == "NOT_EQUAL":
+        return f"{str_expr} <> {lit}"
+    if op == "IN":
+        items = "','".join(escape_string(v.strip()) for v in str(value).split(","))
+        return f"{str_expr} IN ('{items}')"
+    if op == "CONTAINS":
+        return d.position(lit, str_expr)
+    if op == "NOT_CONTAINS":
+        return f"({expr} IS NOT NULL AND NOT ({d.position(lit, str_expr)}))"
+    if op in {"GREATER_THAN", "LESS_THAN", "GREATER_THAN_INCLUSIVE", "LESS_THAN_INCLUSIVE"}:
+        numeric_lit = numeric_literal(value)
+        if numeric_lit is None:
+            # Engine: float() on a non-numeric operand raises → returns False.
+            return "FALSE"
+        sql_op = {
+            "GREATER_THAN": ">",
+            "LESS_THAN": "<",
+            "GREATER_THAN_INCLUSIVE": ">=",
+            "LESS_THAN_INCLUSIVE": "<=",
+        }[op]
+        return f"({expr} IS NOT NULL AND {d.cast_float(expr)} {sql_op} {numeric_lit})"
+    if op == "MODULO":
+        parsed = modulo_literal(value)
+        if parsed is None:
+            # Bad operand — empty string, missing separator, non-numeric
+            # side. Engine catches the cast error and returns False.
+            return "FALSE"
+        divisor_lit, remainder_lit = parsed
+        mod_expr = d.mod(d.cast_number(expr), divisor_lit)
+        return f"({expr} IS NOT NULL AND ({mod_expr}) = {remainder_lit})"
+    if op == "REGEX":
+        pattern = str(value)
+        if not d.regex_supports(pattern):
+            return None
+        return f"({expr} IS NOT NULL AND {d.regexp_anchored_match(str_expr, pattern)})"
+    raise AssertionError(  # pragma: no cover - all TRANSLATABLE_OPERATORS handled above
+        f"unhandled translatable operator in _comparison: {op}"
+    )
+# ---------------------------------------------------------------------------
+# Condition translation: routes the operator to the right SQL shape.
+# ---------------------------------------------------------------------------
+# Operators that have a semver form, mapped to the SQL comparison operator
+# placed between the two semver sort-key expressions. Membership in this
+# table is also what gates the `:semver` path in `_translate_trait_op`.
+_SEMVER_OPS = {
+    "EQUAL": "=",
+    "NOT_EQUAL": "<>",
+    "GREATER_THAN": ">",
+    "LESS_THAN": "<",
+    "GREATER_THAN_INCLUSIVE": ">=",
+    "LESS_THAN_INCLUSIVE": "<=",
+}
+def _translate_trait_op(
+    ctx: TranslateContext,
+    trait_key: str,
+    op: ConditionOperator,
+    val: object,
+) -> str | None:
+    """Translate `op` on a literal trait key into SQL. Returns `None`
+    for inputs the translator can't compile, such as a REGEX pattern
+    the active dialect rejects."""
+    path = ctx.trait_path(trait_key)
+    if op == "IS_SET":
+        return f"{path} IS NOT NULL"
+    if op == "IS_NOT_SET":
+        return f"{path} IS NULL"
+    # Semver-marked comparator — the segment value ends with `:semver`.
+    # Engine only invokes its semver path for the comparators below;
+    # other operators treat the `:semver` suffix as ordinary string
+    # content, which is what the fall-through handlers already do.
+    if isinstance(val, str) and val.endswith(":semver") and op in _SEMVER_OPS:
+        bare = val[:-7]
+        bare_lit = string_literal(bare)
+        col_str = ctx.dialect.cast_string(path)
+        return (
+            f"({path} IS NOT NULL AND "
+            f"{_semver_sort_key_expr(ctx, col_str)} {_SEMVER_OPS[op]} "
+            f"{_semver_sort_key_expr(ctx, bare_lit)})"
+        )
+    # Type-aware comparators on traits — delegate to the dialect. The
+    # discriminator funcs like TYPEOF / IS_*, runtime type-coercion
+    # casts, and short-circuit pitfalls are all engine-specific.
+    if op in {"EQUAL", "NOT_EQUAL"} and val is not None:
+        negate = op == "NOT_EQUAL"
+        eq_pred = ctx.dialect.trait_eq(ctx.identities_alias, trait_key, val, negate=negate)
+        return f"({path} IS NOT NULL AND {eq_pred})"
+    if op == "IN":
+        items = _engine_in_values(val)
+        if items is None:
+            # Bad IN value — neither a string nor a list. Engine returns
+            # False.
+            return "FALSE"
+        in_pred = ctx.dialect.trait_in(ctx.identities_alias, trait_key, items)
+        return f"({path} IS NOT NULL AND {in_pred})"
+    return _comparison(ctx, op, path, val, is_jsonpath=False)
+def translate_condition(cond: SegmentCondition, ctx: TranslateContext) -> str | None:
+    """Translate one segment condition into a SQL boolean fragment.
+    Returns `None` when the condition can't be expressed in SQL: the
+    operator isn't in `TRANSLATABLE_OPERATORS`, the property is an
+    identity-bound JSONPath with no row mapping, a PERCENTAGE_SPLIT
+    targets `$.identity` or a non-identity JSONPath, or the delegated
+    trait/comparison helper itself returns `None`. Returns the literal
+    `"TRUE"` / `"FALSE"` when the outcome is the same for every row.
+    PERCENTAGE_SPLIT asserts that `ctx.segment_key` is set, because the
+    key is used as the hash salt."""
+    op = cond["operator"]
+    if op not in TRANSLATABLE_OPERATORS:
+        return None
+    prop = cond.get("property") or ""
+    val = cond.get("value")
+    # Classify the property up front. Identity-bound JSONPaths —
+    # `$.identity.identifier`, `$.identity.key`, `$.identity.traits.<x>` —
+    # map to row references; non-identity JSONPaths are eval-ctx-bound,
+    # constant for every row, and get pre-computed via the engine. Bare
+    # trait keys bypass the JSONPath compile — they're classified as a
+    # literal trait lookup directly.
+    classification = (
+        _classify_jsonpath(prop) if prop.startswith("$.") else JsonpathClassification("trait", prop)
+    )
+    if classification.kind == "trait":
+        # Trait keys carried via `$.identity.traits.<x>` arrive normalised
+        # to the bare key; literal trait keys come through untouched.
+        assert classification.trait_key is not None
+        prop = classification.trait_key
+    # PERCENTAGE_SPLIT — inline pure-SQL hash.
+    if op == "PERCENTAGE_SPLIT":
+        # `translate_segment` always injects `segment_key` from the segment
+        # before recursing; reaching here without one means a caller invoked
+        # `translate_condition` directly with a half-formed context.
+        assert ctx.segment_key is not None, (
+            "PERCENTAGE_SPLIT requires a segment_key as the hash salt"
+        )
+        threshold_lit = numeric_literal(val)
+        if threshold_lit is None:
+            # Engine: float() on the threshold raises → returns False.
+            return "FALSE"
+        threshold = float(threshold_lit)
+        identity: dict[str, object] = ctx.evaluation_context.get("identity") or {}  # type: ignore[assignment]
+        kind = classification.kind
+        if not prop:
+            # Implicit `$.identity.key` — engine returns False when no
+            # identity, or when the identity lacks `key`. The engine
+            # never synthesises one from env+identifier.
+            if not identity.get("key"):
+                return "FALSE"
+            value_expr = ctx.dialect.cast_string(ctx.identity_key_expr)
+        elif kind == "key":
+            if not identity.get("key"):
+                return "FALSE"
+            value_expr = ctx.dialect.cast_string(ctx.jsonpath_expr("$.identity.key"))
+        elif kind == "identifier":
+            if not identity.get("identifier"):
+                return "FALSE"
+            value_expr = ctx.dialect.cast_string(ctx.jsonpath_expr("$.identity.identifier"))
+        elif kind == "identity_object":
+            # PERCENTAGE_SPLIT on `$.identity` — the whole dict. Engine
+            # hashes `str(dict)`, which is a stable but useless subject;
+            # nobody writes this in practice. Treat as untranslatable.
+            return None
+        elif kind == "untranslatable":
+            # `$.identity.<X>` we don't represent in the row schema.
+            return None
+        elif kind == "static":
+            # Non-identity JSONPath: the engine hashes the resolved value.
+            # We'd need to bake it as a literal hash subject — leave for
+            # future work and let the caller fall back to the engine.
+            return None
+        else:
+            # Plain trait key, or `$.identity.traits.<X>` rewritten to
+            # the bare key. Hash subject pulls from `i.traits:"<key>"`
+            # per row.
+            traits = identity.get("traits") or {}
+            if not isinstance(traits, dict) or prop not in traits:
+                return "FALSE"
+            value_expr = ctx.dialect.cast_string(ctx.trait_path(prop))
+        return _percentage_split_expr(ctx, ctx.segment_key, value_expr, threshold)
+    if not prop:
+        # Non-PERCENTAGE_SPLIT condition without a property — engine looks up
+        # nothing, the comparator's cast fails, returns False.
+        return "FALSE"
+    if classification.kind == "trait":
+        return _translate_trait_op(ctx, prop, op, val)
+    # Non-trait classifications. We don't replicate the engine's per-row
+    # trait-first dispatch — it would roughly double the cost of every
+    # wrapped JSONPath condition. A row that happens to carry a trait
+    # literally named e.g. `$.identity` would shadow our resolution.
+    # Niche shape; the engine-parity suite xfails the one engine-test-
+    # data case that hits it.
+    if classification.kind in ("identifier", "key"):
+        path = ctx.jsonpath_expr(
+            "$.identity.identifier" if classification.kind == "identifier" else "$.identity.key"
+        )
+        if op == "IS_SET":
+            return "TRUE"
+        if op == "IS_NOT_SET":
+            return "FALSE"
+        return _comparison(ctx, op, path, val, is_jsonpath=True)
+    if classification.kind == "identity_object":
+        # `$.identity` — engine treats non-primitive lookups as "not
+        # set" by design; no operator meaningfully takes an object. So
+        # IS_SET → FALSE, IS_NOT_SET → TRUE, every scalar comparator
+        # fail-casts on the dict → FALSE. The SQL answer is the same
+        # for every row regardless of whether the eval context carries
+        # an identity, so we encode it directly.
+        return "TRUE" if op == "IS_NOT_SET" else "FALSE"
+    if classification.kind == "untranslatable":
+        # Identity-bound JSONPath we can't map to row state — caller falls
+        # back to the engine.
+        return None
+    # static
+    return _engine_static_verdict(ctx, cond)
+# ---------------------------------------------------------------------------
+# Rule and segment translation: Boolean composition over conditions.
+# ---------------------------------------------------------------------------
+def translate_rule(rule: SegmentRule, ctx: TranslateContext) -> str | None:
+    children: list[str] = []
+    for cond in rule.get("conditions") or []:
+        sql = translate_condition(cond, ctx)
+        if sql is None:
+            return None
+        children.append(f"({sql})")
+    for nested in rule.get("rules") or []:
+        sql = translate_rule(nested, ctx)
+        if sql is None:
+            return None
+        children.append(f"({sql})")
+    assert children, "segment rule must have at least one condition or nested rule"
+    match rule["type"]:
+        case "ALL":
+            return " AND ".join(children)
+        case "ANY":
+            return " OR ".join(children)
+        case "NONE":
+            return f"NOT ({' OR '.join(children)})"
+def translate_segment(segment: SegmentContext, ctx: TranslateContext) -> str | None:
+    """Return a SQL `WHERE` expression for the segment.
+    Output shape::
+        SELECT ... FROM IDENTITIES i
+        WHERE i.environment_id = '<env-key>'
+          AND <returned expression>
+    The caller composes the surrounding query; the translator only
+    produces the predicate.
+    Returns `None` if any condition uses an untranslatable operator —
+    currently a REGEX pattern the active dialect's regex flavour can't
+    compile. Callers should fall back to
+    `flag_engine.is_context_in_segment` for those segments.
+    """
+    ctx = ctx.with_segment_key(segment["key"])
+    rules = segment.get("rules") or []
+    if not rules:
+        return "FALSE"
+    rule_sql: list[str] = []
+    for r in rules:
+        sql = translate_rule(r, ctx)
+        if sql is None:
+            return None
+        rule_sql.append(f"({sql})")
+    return " AND ".join(rule_sql)

flagsmith_sql_flag_engine/utils.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""SQL escape, validation, and regex-flavour primitives, shared by
+the translator and dialects.
+The translator emits SQL by string composition rather than via a query-
+builder. Every value originating in a `SegmentCondition` or evaluation
+context must be escaped or validated before it lands in a SQL fragment;
+this module is the single home for that logic.
+If you find yourself f-string-interpolating a segment- or context-derived
+value, route it through one of these helpers. Bypassing this layer is how
+SQL injection happens; the audit trail is the call sites here.
+Threat model: segment definitions come from Flagsmith users with
+`MANAGE_SEGMENTS` permission on a project — trusted-but-not-fully-trusted.
+A malicious operand value must not be able to escalate to arbitrary SQL
+execution against the analytical store.
+Functions in this module are dialect-agnostic. Anything that depends on
+SQL-engine syntax — VARIANT path quoting, JSONB extraction, casts — lives
+on the `Dialect` protocol instead.
+"""
+import re
+def escape_string(value: str) -> str:
+    """Double single quotes for inclusion inside a SQL string literal.
+    Use when the caller is composing a larger literal — for example a
+    CSV-style `IN ('a','b','c')` — and wants the un-wrapped escape. For
+    a single standalone value, prefer `string_literal`.
+    """
+    return value.replace("'", "''")
+def string_literal(value: str) -> str:
+    """Wrap a value as a single-quoted SQL string literal."""
+    return "'" + escape_string(value) + "'"
+def numeric_literal(value: object) -> str | None:
+    """Validate `value` is numeric and return its canonical-float string form.
+    Returns `None` if `value` is not parseable as a float — the caller
+    propagates that as "untranslatable" rather than injecting unparseable
+    SQL.
+    Booleans are rejected explicitly: `float(True) == 1.0` in Python,
+    but the engine treats segment-value booleans as strings via its
+    type-coercion path, so a numeric interpretation here would diverge.
+    """
+    if isinstance(value, bool):
+        return None
+    try:
+        return str(float(value))  # type: ignore[arg-type]
+    except (TypeError, ValueError):
+        return None
+# Conservative check for Python-re features RE2 doesn't support.
+_RE2_UNSAFE = re.compile(
+    r"\\\d"  # backreference like \1 .. \9
+    r"|\(\?[=!<]"  # lookahead / lookbehind / negative variants
+)
+def re2_safe(pattern: str) -> bool:
+    """Return True if `pattern` uses only features RE2 supports.
+    RE2 explicitly excludes backreferences and lookarounds. Use this as
+    the regex feature-detector in dialects whose SQL engine uses RE2 —
+    Snowflake, BigQuery, DuckDB, ClickHouse.
+    """
+    return _RE2_UNSAFE.search(pattern) is None
+def modulo_literal(value: object) -> tuple[str, str] | None:
+    """Parse a `divisor|remainder` MODULO operand pair.
+    Returns `(divisor, remainder)` as canonical-float string forms, or
+    `None` if either side fails to parse.
+    """
+    try:
+        divisor_str, remainder_str = str(value).split("|")
+        return str(float(divisor_str)), str(float(remainder_str))
+    except (ValueError, AttributeError):
+        return None

flagsmith_sql_flag_engine-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,153 @@
+Metadata-Version: 2.4
+Name: flagsmith-sql-flag-engine
+Version: 0.1.0
+Summary: SQL translator for Flagsmith segment predicates.
+Author: Flagsmith
+Author-email: Flagsmith <engineering@flagsmith.com>
+License-Expression: BSD-3-Clause
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: SQL
+Classifier: Topic :: Database
+Requires-Dist: flagsmith-flag-engine>=10
+Requires-Dist: jsonpath-rfc9535>=0.2
+Requires-Python: >=3.10
+Project-URL: Homepage, https://github.com/Flagsmith/flagsmith-sql-flag-engine
+Description-Content-Type: text/markdown
+# flagsmith-sql-flag-engine
+SQL translator for Flagsmith segment predicates.
+Where the Python and Rust `flag_engine` implementations evaluate
+`is_context_in_segment` against an in-memory `EvaluationContext`, this
+package takes a `SegmentContext` and emits a SQL `WHERE` expression that
+evaluates the segment against an entire `IDENTITIES` table — one row per
+identity, with the identity's full trait map held in a single column
+the translator path-extracts at query time. `PERCENTAGE_SPLIT` and
+`:semver`-marked comparators compile to inline pure-SQL.
+## Quickstart
+```python
+from flag_engine.context.types import EvaluationContext, SegmentContext
+from flagsmith_sql_flag_engine import TranslateContext, translate_segment
+from flagsmith_sql_flag_engine.dialects import ClickHouseDialect
+eval_context: EvaluationContext = {
+    "environment": {"key": "n9fbf9...3ngWhb", "name": "Production"},
+}
+ctx = TranslateContext(evaluation_context=eval_context, dialect=ClickHouseDialect())
+segment: SegmentContext = {
+    "key": "growth-cohort",
+    "name": "Growth cohort",
+    "rules": [
+        {
+            "type": "ALL",
+            "conditions": [
+                {"operator": "EQUAL", "property": "plan", "value": "growth"},
+            ],
+        },
+    ],
+}
+where_expr = translate_segment(segment, ctx)
+# where_expr is a SQL string. Drop into:
+#   SELECT COUNT(*) FROM IDENTITIES i
+#   WHERE i.environment_id = 'n9fbf9...3ngWhb' AND ({where_expr})
+```
+`environment_id` in the `IDENTITIES` table is a string column holding
+`EnvironmentContext.key` directly — the same identifier the engine uses,
+no separate integer PK.
+`translate_segment` returns `None` if the segment contains a condition the
+translator can't handle — typically a REGEX pattern the active dialect's
+regex flavour can't compile, or an identity JSONPath with no row-level
+equivalent. Callers should fall back to
+`flag_engine.is_context_in_segment` for those segments.
+## Schema
+Each dialect publishes the table layout it expects via a `schema_ddl`
+constant. For ClickHouse:
+```sql
+CREATE TABLE IF NOT EXISTS IDENTITIES (
+    environment_id String,
+    id UInt64,
+    identifier String,
+    identity_key String,
+    traits JSON
+)
+ENGINE = MergeTree()
+ORDER BY (environment_id, id);
+```
+Traits live in a single `JSON` column (CH 24+, GA in 25.x). Each key is
+stored as a typed subcolumn, so trait reads are direct columnar scans
+rather than per-row JSON parses. Trait keys are *data* — new keys appear
+without schema changes — and the translator only sees the abstract path
+extraction.
+ClickHouse Cloud requires `SET allow_experimental_json_type = 1` when
+creating a `JSON`-column table (the type is GA on OSS 25.x); the test
+harness applies this setting automatically.
+Programmatic access:
+```python
+from flagsmith_sql_flag_engine.dialects.clickhouse import SCHEMA_DDL
+```
+## Engine parity
+Validated against [Flagsmith/engine-test-data](https://github.com/Flagsmith/engine-test-data),
+the test suite every engine implementation is checked against. The
+engine-parity suite loads each test case's identity into a per-dialect
+scratch table, translates the case's segments, runs the generated SQL,
+and compares to `flag_engine.is_context_in_segment`.
+To run the engine-parity suite locally:
+```bash
+git submodule update --init                 # pull engine-test-data
+docker compose up --detach --wait clickhouse
+uv run pytest tests/test_engine.py
+```
+Adding a new dialect's parity coverage is one harness module — see
+`tests/harnesses/` for the shape.
+## Dialects
+The translator is dialect-aware: a `Dialect` protocol abstracts the
+SQL fragments that differ across SQL engines — MD5 hex, hex-to-int
+parsing, prefix-anchored regex, padded-version comparison, type-aware
+trait predicates, regex flavour. Today `ClickHouseDialect` is the only
+implementation; adding another engine such as Snowflake, DuckDB or
+Postgres means writing one class.
+## Operator coverage
+| Operator                                     | Translatable | Notes                                                          |
+| -------------------------------------------- | :----------: | -------------------------------------------------------------- |
+| `EQUAL`, `NOT_EQUAL`, `IN`                   |     yes      |                                                                |
+| `IS_SET`, `IS_NOT_SET`                       |     yes      | trait subcolumn `IS NOT NULL` / `IS NULL`                      |
+| `CONTAINS`, `NOT_CONTAINS`                   |     yes      |                                                                |
+| `GREATER_THAN`, `LESS_THAN` plus `_INCLUSIVE`|     yes      |                                                                |
+| `MODULO`                                     |     yes      |                                                                |
+| `PERCENTAGE_SPLIT`                           |     yes      | inlined MD5-mod-9999; ~0.005% diverge on hash==9998            |
+| `REGEX`                                      |   partial    | dialect-flavour gated; unsupported patterns → caller fallback  |
+| `:semver`-marked comparators                 |     yes      | major.minor.patch only; ignores prerelease                     |
+## Development
+```bash
+make install                  # uv sync + pre-commit install
+make lint                     # run pre-commit hooks across the tree
+make typecheck                # mypy
+make test                     # unit tests
+```
+Ruff (lint + format) runs as a pre-commit hook on every commit. Mypy
+runs as a `make typecheck` hook on staged Python files.

flagsmith_sql_flag_engine-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+flagsmith_sql_flag_engine/__init__.py,sha256=DgiUBg8KnfdisZiMNsWpzKYtDqJoLvxnKctXQGESdZg,710
+flagsmith_sql_flag_engine/dialect.py,sha256=G4rzXszXUVM8iV-r0w_YJgFAWtqWx1hfJI48P9zuTmE,4514
+flagsmith_sql_flag_engine/dialects/__init__.py,sha256=4pkblw-Jr04CLnSbpMud5Vyj5alBgB6WhSm0Cz9OWSI,141
+flagsmith_sql_flag_engine/dialects/clickhouse.py,sha256=Flwy2QNEv-A53rvZoaWv8Q0wytuO8NY4IaRogGr0Ztg,15523
+flagsmith_sql_flag_engine/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+flagsmith_sql_flag_engine/translator.py,sha256=zRWbq_ZMP1VgpQDEMdlN91u4XaUVXC9zSExkIxEraiM,22952
+flagsmith_sql_flag_engine/utils.py,sha256=ygP8cijp1jQjszFj87odK95jdfV3DwaT02eXrQdikAA,3202
+flagsmith_sql_flag_engine-0.1.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+flagsmith_sql_flag_engine-0.1.0.dist-info/METADATA,sha256=g8J3yDLDPgyV-cjsaDD3Y3u424M5_HVc17NRYo45Eso,6137
+flagsmith_sql_flag_engine-0.1.0.dist-info/RECORD,,

flagsmith_sql_flag_engine-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: uv 0.8.24
+Root-Is-Purelib: true
+Tag: py3-none-any