flagsmith-sql-flag-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ """SQL translator for Flagsmith segment predicates.
2
+
3
+ Public API:
4
+ translate_segment(segment, ctx) -> str | None
5
+ TranslateContext
6
+
7
+ See README.md for usage. The translator is dialect-aware via the `Dialect`
8
+ protocol; `flagsmith_sql_flag_engine.dialects.clickhouse.ClickHouseDialect`
9
+ is the only implementation today.
10
+ """
11
+
12
+ from flagsmith_sql_flag_engine.dialect import Dialect
13
+ from flagsmith_sql_flag_engine.translator import (
14
+ TRANSLATABLE_OPERATORS,
15
+ TranslateContext,
16
+ translate_condition,
17
+ translate_rule,
18
+ translate_segment,
19
+ )
20
+
21
+ __all__ = [
22
+ "TRANSLATABLE_OPERATORS",
23
+ "Dialect",
24
+ "TranslateContext",
25
+ "translate_condition",
26
+ "translate_rule",
27
+ "translate_segment",
28
+ ]
@@ -0,0 +1,125 @@
1
+ """Per-dialect SQL fragments — MD5 hex, hex-to-int parsing, prefix-anchored
2
+ regex, padded-version comparison, type-aware trait predicates, regex flavour."""
3
+
4
+ from typing import Protocol
5
+
6
+
7
class Dialect(Protocol):
    """Contract each SQL backend fulfils for the translator.

    Every method receives SQL text that is already rendered (column
    references, quoted literals) and hands back another SQL text
    fragment. A dialect never decides *what* to compare, only *how* its
    engine spells the operation.
    """

    name: str  # human-readable, used in test ids and error messages

    # --- IDENTITIES schema access ---
    #
    # The dialect owns the canonical IDENTITIES schema, see `schema_ddl`,
    # so it also owns the SQL expression for each logical column. The
    # translator just hands over an alias.

    def identifier_expr(self, alias: str) -> str:
        """Render the SQL expression backing `$.identity.identifier`."""

    def identity_key_expr(self, alias: str) -> str:
        """Render the SQL expression backing `$.identity.key`."""

    def trait_path(self, alias: str, trait_key: str) -> str:
        """Render the lookup of one trait inside the IDENTITIES traits
        container.

        Path syntax is engine-specific, hence a dialect concern.
        """

    def trait_eq(self, alias: str, trait_key: str, value: object, negate: bool) -> str:
        """Render a type-aware EQUAL (`negate=False`) or NOT_EQUAL
        (`negate=True`) predicate on a trait.

        Follows `flag_engine`'s per-type coercion: the segment value is
        cast to the trait's runtime type before the compare, and a
        failed cast means "no match" for both operators. Lives on the
        dialect because both trait-type discrimination and the coercion
        casts differ between engines.
        """

    def trait_in(self, alias: str, trait_key: str, items: list[str]) -> str:
        """Render a type-aware IN predicate on a trait.

        Engine semantics: a string trait is looked up directly, an
        integer trait is stringified and then looked up, any other trait
        type never matches. `items` is the candidate list already parsed
        the way `flag_engine`'s `_get_in_values` parses it.
        """

    # --- string operations ---

    def position(self, needle_lit: str, haystack_expr: str) -> str:
        """Render a boolean test for the string literal `needle_lit`
        occurring inside `haystack_expr`. Backs CONTAINS / NOT_CONTAINS."""

    def lpad(self, expr: str, width: int, pad_lit: str) -> str:
        """Render `expr` left-padded to `width` characters with `pad_lit`."""

    def coalesce(self, *exprs: str) -> str:
        """Render a COALESCE/NVL-style "first non-null of `exprs`"."""

    # --- regex ---

    def regex_supports(self, pattern: str) -> bool:
        """Report whether this dialect's regex engine can compile
        `pattern`.

        When this is False the translator yields `None` for the REGEX
        condition so the caller can defer to `flag_engine`.
        """

    def regexp_anchored_match(self, value_expr: str, pattern: str) -> str:
        """Render a boolean equivalent to Python's
        `re.match(pattern, value)`: anchored at position 0, allowed to
        match only a prefix, not a full-match.

        `pattern` arrives as the raw Python regex string. Escaping it
        into a SQL literal is the dialect's job because regex flavours
        disagree on backslash handling.
        """

    def regexp_nth_digit_run(self, value_expr: str, n: int) -> str:
        """Render extraction of the n-th run of digits in `value_expr`,
        yielding NULL when fewer than n runs exist. Used for semver."""

    # --- hashing primitives for PERCENTAGE_SPLIT ---

    def md5_hex(self, expr: str) -> str:
        """Render the lowercase 32-char hex MD5 digest of `expr`.

        NOTE(review): `ClickHouseDialect.md5_hex` returns the raw
        16-byte digest instead and pairs it with a byte-oriented
        `parse_hex_chunk` — confirm whether this contract should read
        "any digest form the same dialect's `parse_hex_chunk` accepts".
        """

    def parse_hex_chunk(self, hex_expr: str, start: int, length: int = 8) -> str:
        """Render the non-negative integer encoded by `length` hex
        characters of `hex_expr`, beginning at 1-indexed `start`."""

    # --- type casts ---

    def cast_string(self, expr: str) -> str:
        """Render a cast of `expr` to STRING / VARCHAR."""

    def cast_float(self, expr: str) -> str:
        """Render a cast of `expr` to a 64-bit float / DOUBLE."""

    def cast_number(self, expr: str) -> str:
        """Render a cast of `expr` to NUMBER / BIGINT, the engine-side
        numeric type used for modulo arithmetic."""

    # --- composition ---

    def mod(self, dividend: str, divisor: str) -> str:
        """Render `dividend MOD divisor` as a numeric value."""
@@ -0,0 +1,5 @@
1
+ """Dialect implementations."""
2
+
3
+ from flagsmith_sql_flag_engine.dialects.clickhouse import ClickHouseDialect
4
+
5
+ __all__ = ["ClickHouseDialect"]
@@ -0,0 +1,329 @@
1
+ """ClickHouse dialect: SQL fragments tailored to ClickHouse's function set.
2
+
3
+ ## Expected schema
4
+
5
+ The translator emits predicates against a single `IDENTITIES` table —
6
+ four typed columns `environment_id`, `id`, `identifier`, `identity_key`,
7
+ plus one `JSON` column `traits` holding the identity's full trait map
8
+ in ClickHouse's native columnar JSON layout. Trait keys are JSON paths
9
+ on the column, not schema columns.
10
+
11
+ The `JSON` type was chosen over `Nullable(String)` + `JSONExtract*`
12
+ because:
13
+
14
+ - It stores each path as a typed subcolumn, so trait reads are a
15
+ direct columnar scan — no per-row JSON parse. Empirically: at 870M
16
+ rows on a Cloud trial, simple/multi predicates dropped from 14-20×
17
+ slower than Snowflake VARIANT to within 2.5-4×. The wide-String
18
+ variant scales linearly with row count where Snowflake / `JSON`
19
+ stay near-flat.
20
+ - Schema evolution is implicit: new trait keys appear as new
21
+ subcolumns at INSERT time, no DDL change.
22
+ - It matches Snowflake `VARIANT`'s semantic model — same NULL-on-miss
23
+ behaviour, same type discrimination, same path syntax cost shape.
24
+
25
+ The trade-off is that ClickHouse caps `max_dynamic_paths` per JSON
26
+ column (default 1024). Above that, additional paths spill into a
27
+ `Dynamic` catch-all and lose the columnar fast path. This is fine for
28
+ typical Flagsmith trait vocabularies; we should monitor.
29
+
30
+ ## Notable choices
31
+
32
+ - Subcolumn access uses backtick-quoted identifiers: ``i.traits.`key` ``.
33
+ Backticks are doubled to escape; arbitrary trait keys including
34
+ spaces and dots are supported. CH's `getSubcolumn(json, 'key')`
35
+ function works but doesn't compose with the typed-variant `.:Type`
36
+ accessor, so we standardise on backtick form everywhere.
37
+
38
+ - `trait_path` returns the trait's canonical string form via
39
+ `toString(<sub>)`, with a leading `IS NULL` guard so missing keys
40
+ and JSON null surface as SQL NULL. Mirrors Snowflake's `::STRING`
41
+ semantics — downstream regex / position / compare paths get
42
+ unquoted strings, decimal digits for numerics, and `'true'` /
43
+ `'false'` for bools.
44
+
45
+ - `trait_eq` (positive) leads with a `toString(<sub>) = <lit>` fast
46
+ path — covers String + canonical-stringified Int / UInt / Float +
47
+ lowercase Bool in one subcolumn read. A typed-variant Bool branch
48
+ (``<sub>.:Bool = <target>``) picks up Python-bool-repr "True" /
49
+ "False" coercions, and a `toFloat64OrNull(toString(<sub>))` branch
50
+ catches floats whose canonical toString integer-trims (1.0 → '1').
51
+ Mirrors Snowflake's `v::STRING` fast path. `NOT_EQUAL` still does
52
+ explicit per-type dispatch via typed-variant subcolumns
53
+ (``.:String``, ``.:Int64``, ``.:UInt64``, ``.:Float64``, ``.:Bool``);
54
+ each accessor is NULL when the JSON value is the wrong type, which
55
+ matches the engine's "cast failed → False" semantics.
56
+
57
+ - Anchored regex uses `match(value, '^(...)')` — ClickHouse's `match`
58
+ is RE2 and unanchored, so we prepend `^` to mirror Python's
59
+ `re.match` (start-anchored, prefix-allowed, not full-match).
60
+
61
+ - n-th digit run uses `extractAll(value, '\\d+')[n]`; ClickHouse's
62
+ array subscript is 1-indexed and returns `''` for out-of-bounds, so
63
+ we `nullIf(..., '')` to keep the engine's "no n-th run" → NULL
64
+ contract.
65
+
66
+ - Hex-chunk parsing reads directly from the raw 16-byte MD5 output
67
+ rather than round-tripping through hex. `MD5(expr)` returns a
68
+ `FixedString(16)`; `reinterpretAsUInt32(reverse(substring(...)))`
69
+ pulls a big-endian UInt32 out of any 4-byte slice. Skipping the
70
+ `hex(MD5(...))` → `unhex(substring(...))` round-trip is a small but
71
+ consistent speedup on `% Split`-heavy predicates.
72
+
73
+ ## Setup
74
+
75
+ `JSON` type DDL requires `SET allow_experimental_json_type = 1` on
76
+ ClickHouse Cloud as of 25.12 (no longer experimental on OSS 25.x).
77
+ Callers should apply this setting at session creation."""
78
+
79
+ from flagsmith_sql_flag_engine.utils import re2_safe, string_literal
80
+
81
+ SCHEMA_DDL = """\
82
+ CREATE TABLE IF NOT EXISTS IDENTITIES (
83
+ -- environment.key from EnvironmentContext; used as the env partition
84
+ environment_id String,
85
+
86
+ -- stable per-identity row id
87
+ id UInt64,
88
+
89
+ -- the identity's external identifier, exposed as $.identity.identifier
90
+ identifier String,
91
+
92
+ -- the composite identity key, exposed as $.identity.key
93
+ identity_key String,
94
+
95
+ -- the identity's full trait map. ClickHouse's `JSON` type stores each
96
+ -- path as a typed subcolumn so trait lookups are columnar reads, not
97
+ -- per-row JSON parses. SQL NULL for an identity with no traits.
98
+ traits JSON
99
+ )
100
+ ENGINE = MergeTree()
101
+ ORDER BY (environment_id, id);
102
+ """
103
+
104
+
105
+ def _backtick(trait_key: str) -> str:
106
+ """Escape a trait key for use as a backtick-quoted JSON subcolumn name.
107
+ Doubles embedded backticks per CH's identifier escape rule."""
108
+ return "`" + trait_key.replace("`", "``") + "`"
109
+
110
+
111
+ def _non_null(expr: str) -> str:
112
+ """Coerce a possibly-`Nullable(String)` expression down to non-nullable
113
+ `String`. ClickHouse rejects regex functions (`match`, `extractAll`)
114
+ over `Nullable(String)` because the inferred result types
115
+ `Nullable(UInt8)` / `Nullable(Array(String))` aren't representable.
116
+ The translator always guards these calls with `IS NOT NULL`, so the
117
+ coalesce default is unreachable at runtime."""
118
+ return f"ifNull({expr}, '')"
119
+
120
+
121
class ClickHouseDialect:
    """ClickHouse implementation of the `Dialect` protocol.

    Every method returns a SQL text fragment written against the
    `IDENTITIES` table described by `schema_ddl`. Traits are read as
    subcolumns of the `traits` JSON column.
    """

    name = "clickhouse"
    schema_ddl = SCHEMA_DDL

    # ----- IDENTITIES schema access -----

    def identifier_expr(self, alias: str) -> str:
        """Column reference backing `$.identity.identifier`."""
        return f"{alias}.identifier"

    def identity_key_expr(self, alias: str) -> str:
        """Column reference backing `$.identity.key`."""
        return f"{alias}.identity_key"

    def _sub(self, alias: str, trait_key: str) -> str:
        """The raw JSON subcolumn reference for a trait key.
        ``alias.traits.`key` `` — Dynamic-typed, NULL for missing keys
        and explicit JSON null."""
        return f"{alias}.traits.{_backtick(trait_key)}"

    def trait_path(self, alias: str, trait_key: str) -> str:
        """Return the trait's canonical string form, or SQL NULL when the
        key is missing or holds JSON null."""
        # Mirrors Snowflake's `i.traits:"key"::STRING`:
        #
        # - missing key → NULL
        # - JSON null value → NULL
        # - JSON string "x" → 'x' (quotes stripped)
        # - JSON int / float → '42' / '3.14'
        # - JSON true / false → 'true' / 'false'
        #
        # `toString` over a JSON subcolumn does the right canonicalisation
        # natively. The `IS NULL` guard distinguishes missing from a
        # JSON empty string (`""` round-trips as `''` through toString,
        # the same value `toString(NULL)` produces) — the translator's
        # `IS NULL` / `IS NOT NULL` checks rely on this distinction.
        sub = self._sub(alias, trait_key)
        return f"if({sub} IS NULL, NULL, toString({sub}))"

    def trait_eq(self, alias: str, trait_key: str, value: object, negate: bool) -> str:
        """Type-aware EQUAL (`negate=False`) / NOT_EQUAL (`negate=True`)
        predicate on a trait.

        The segment `value` is stringified, then pre-cast in Python to
        every type the trait might have at runtime (bool, int, float). A
        cast that fails here means that type's branch can never match.
        """
        sub = self._sub(alias, trait_key)
        str_value = str(value)
        str_lit = string_literal(str_value)
        # Engine bool cast: `v not in ("False", "false")`. A JSON true matches
        # every segment value except literal "False" / "false"; those two coerce
        # to False and match a JSON false.
        bool_target = "true" if str_value not in ("False", "false") else "false"
        # Engine int / float cast: ValueError → no match for that branch.
        try:
            int_lit: str | None = str(int(str_value))
        except (ValueError, TypeError):
            int_lit = None
        try:
            float_lit: str | None = repr(float(str_value))
        except (ValueError, TypeError):
            float_lit = None

        # `toString(<sub>)` returns the JSON value's canonical string form
        # in a single subcolumn read — 'x' for String, '42' for Int / UInt,
        # '3.14' for Float, 'true' / 'false' for Bool. Mirrors Snowflake's
        # `v::STRING` and lets us collapse the typical match path to one
        # comparison instead of an OR across five typed-variant subcolumns.
        str_path = f"toString({sub})"
        bool_sub = f"{sub}.:Bool"

        if not negate:
            # Fast path: covers String + canonical-stringified Int / UInt /
            # Float + lowercase Bool ('true' / 'false') in one branch.
            clauses = [f"({str_path} = {str_lit})"]
            # Bool branch: engine treats any segment value except "False" /
            # "false" as bool True, so a JSON true trait must match e.g.
            # `EQUAL("flag", "growth")`. The fast path catches the
            # lowercase case; this branch picks up Python-bool-repr "True"
            # / "False" and any other coercion that doesn't string-match
            # 'true' / 'false' directly.
            clauses.append(f"({bool_sub} = {bool_target})")
            # Float branch: floats whose `toString` integer-trims (1.0 →
            # '1') miss the fast path against a `'1.0'` segment value.
            # `toFloat64OrNull(str_path)` covers Int / UInt / Float
            # uniformly; non-numeric traits stringify to something
            # `toFloat64OrNull` rejects → NULL → no match.
            #
            # The branch is emitted for every float-parseable value. It
            # must not be skipped when `repr(float(v)) == v`: Python's
            # canonical float text ('1.0') is not ClickHouse's ('1'), so
            # exactly those values are the ones the fast path misses.
            if float_lit is not None:
                clauses.append(f"(toFloat64OrNull({str_path}) = {float_lit})")
            return "(" + " OR ".join(clauses) + ")"

        # NOT_EQUAL: per-type dispatch. Engine returns True only when the
        # cast succeeded *and* values differ. `.:Type IS NOT NULL AND .:Type
        # <> lit` encodes that directly; types where the segment value can't
        # cast contribute FALSE.
        no_match = "FALSE"
        str_sub = f"{sub}.:String"
        int_sub = f"{sub}.:Int64"
        uint_sub = f"{sub}.:UInt64"
        float_sub = f"{sub}.:Float64"
        bool_branch = f"({bool_sub} IS NOT NULL AND {bool_sub} <> {bool_target})"
        if int_lit is not None or float_lit is not None:
            # Prefer the integer rendering when the value parses as one.
            num_lit = int_lit if int_lit is not None else float_lit
            num_branch = (
                f"(({int_sub} IS NOT NULL AND {int_sub} <> {num_lit})"
                f" OR ({uint_sub} IS NOT NULL AND {uint_sub} <> {num_lit})"
                f" OR ({float_sub} IS NOT NULL AND {float_sub} <> {num_lit}))"
            )
        else:
            num_branch = no_match
        return (
            f"(({str_sub} IS NOT NULL AND {str_sub} <> {str_lit}) OR {bool_branch} OR {num_branch})"
        )

    def trait_in(self, alias: str, trait_key: str, items: list[str]) -> str:
        """Type-aware IN predicate on a trait.

        Returns the literal `FALSE` for an empty `items` list: nothing
        can be a member of an empty set, and rendering `IN ()` risks a
        syntax error.
        """
        if not items:
            return "FALSE"
        # `toString(<sub>)` returns the canonical string form for any JSON
        # value type in a single subcolumn read. Engine semantics only
        # match String and integer trait types — bool / float / array
        # traits never match — so we gate the toString-based IN compare on
        # `.:Bool IS NULL AND .:Float64 IS NULL`. Int / UInt traits pass
        # because their stringified form ('42') matches the item literals;
        # missing keys propagate NULL through toString and fail the IN.
        sub = self._sub(alias, trait_key)
        bool_sub = f"{sub}.:Bool"
        float_sub = f"{sub}.:Float64"
        str_path = f"toString({sub})"
        item_lits = ",".join(string_literal(v) for v in items)
        return f"({bool_sub} IS NULL AND {float_sub} IS NULL AND {str_path} IN ({item_lits}))"

    # ----- string operations -----

    def position(self, needle_lit: str, haystack_expr: str) -> str:
        """Boolean fragment: `needle_lit` occurs inside `haystack_expr`."""
        # ClickHouse's argument order is (haystack, needle), opposite of
        # Snowflake's POSITION(needle, haystack). Returns 1-indexed
        # position, 0 for not-found.
        return f"position({haystack_expr}, {needle_lit}) > 0"

    def lpad(self, expr: str, width: int, pad_lit: str) -> str:
        """Left-pad `expr` to `width` with `pad_lit` via `leftPad`."""
        return f"leftPad({expr}, {width}, {pad_lit})"

    def coalesce(self, *exprs: str) -> str:
        """First non-null of `exprs` via `coalesce`."""
        return f"coalesce({', '.join(exprs)})"

    # ----- regex -----

    def regex_supports(self, pattern: str) -> bool:
        """Whether `pattern` passes the `re2_safe` check."""
        # ClickHouse's regex engine is RE2 (`match`, `extractAll`).
        return re2_safe(pattern)

    @staticmethod
    def _regex_literal(pattern: str) -> str:
        """Render a raw regex as a single-quoted ClickHouse string literal."""
        # ClickHouse string literals process `\` as an escape, so a SQL
        # `'\d'` reaches the regex engine as `d`. Double the backslashes so
        # the engine sees `\d`; SQL single quotes are escaped by doubling
        # per the SQL standard.
        doubled = pattern.replace("\\", "\\\\").replace("'", "''")
        return f"'{doubled}'"

    def regexp_anchored_match(self, value_expr: str, pattern: str) -> str:
        """Boolean fragment with Python `re.match` semantics."""
        # `match` is RE2 but unanchored — equivalent to `re.search`. Prepend
        # `^` to get `re.match` semantics (start-anchored, prefix-allowed).
        # Wrapping in `(...)` keeps the user's top-level alternation from
        # binding tighter than the anchor.
        anchored = "^(" + pattern + ")"
        return f"match({_non_null(value_expr)}, {self._regex_literal(anchored)})"

    def regexp_nth_digit_run(self, value_expr: str, n: int) -> str:
        """Fragment yielding the n-th digit run of `value_expr`, or NULL."""
        # `extractAll` returns the matches array; subscript is 1-indexed
        # and yields `''` past the end. `nullIf` collapses that to NULL so
        # `COALESCE` upstream can fall back to `'0'`. `ifNull` coerces a
        # `Nullable(String)` input down to `String` — ClickHouse refuses
        # `extractAll` on `Nullable(String)` because the inferred result
        # type `Nullable(Array(String))` is unrepresentable.
        digit_run = self._regex_literal("\\d+")
        return f"nullIf(extractAll({_non_null(value_expr)}, {digit_run})[{n}], '')"

    # ----- hashing -----

    def md5_hex(self, expr: str) -> str:
        """Fragment producing the MD5 digest of `expr` as raw bytes.

        Despite the protocol-mandated name this is *not* a hex string;
        the result is only meaningful as input to this class's
        `parse_hex_chunk`.
        """
        # Return the raw 16-byte MD5 digest rather than the hex string.
        # `parse_hex_chunk` below reads bytes directly via
        # `reinterpretAsUInt32(reverse(substring(...)))`, skipping the
        # `hex` → `unhex` round-trip — small but consistent win on
        # PERCENTAGE_SPLIT-heavy predicates.
        return f"MD5({expr})"

    def parse_hex_chunk(self, hex_expr: str, start: int, length: int = 8) -> str:
        """Fragment reading the integer that `length` hex characters at
        1-indexed hex position `start` would encode, taken straight from
        the raw digest bytes."""
        # `hex_expr` is the raw `FixedString(16)` from `md5_hex` (not a hex
        # string). Map the 1-indexed hex start position to a 1-indexed byte
        # position: hex 1 → byte 1, hex 9 → byte 5, hex 17 → byte 9,
        # hex 25 → byte 13. 8 hex chars = 4 raw bytes.
        byte_start = (start - 1) // 2 + 1
        byte_length = length // 2
        slice_expr = f"substring({hex_expr}, {byte_start}, {byte_length})"
        # `reinterpretAsUInt32` reads bytes little-endian; `reverse` first
        # so the value equals `int(hex_chars, 16)` for the corresponding
        # hex slice — preserves `_HASH_CONST_*` constants from the translator.
        return f"reinterpretAsUInt32(reverse({slice_expr}))"

    # ----- casts -----

    def cast_string(self, expr: str) -> str:
        """Cast `expr` to String via `toString`."""
        return f"toString({expr})"

    def cast_float(self, expr: str) -> str:
        """Cast `expr` to Float64, yielding NULL when it isn't numeric."""
        # `toFloat64OrNull` over the string form sidesteps `toFloat64`'s
        # exception on a non-numeric input — engine behaviour on a cast
        # failure is "doesn't match", which NULL propagation through the
        # surrounding predicate gives us.
        return f"toFloat64OrNull(toString({expr}))"

    def cast_number(self, expr: str) -> str:
        """Cast `expr` to Int64 via its string form, NULL on failure."""
        return f"toInt64OrNull(toString({expr}))"

    # ----- composition -----

    def mod(self, dividend: str, divisor: str) -> str:
        """`dividend` modulo `divisor` via `modulo`."""
        return f"modulo({dividend}, {divisor})"
File without changes
@@ -0,0 +1,569 @@
1
+ """Translate `SegmentContext` predicate trees into SQL `WHERE` expressions.
2
+
3
+ Output drops into:
4
+
5
+ SELECT ... FROM IDENTITIES i
6
+ WHERE i.environment_id = '<env-key>' AND <translator output>
7
+
8
+ Returns `None` if any condition uses an operator the active dialect
9
+ can't translate — callers fall back to `flag_engine.is_context_in_segment`.
10
+ """
11
+
12
+ import json
13
+ from typing import Literal, NamedTuple
14
+
15
+ import jsonpath_rfc9535
16
+ from flag_engine.context.types import (
17
+ EvaluationContext,
18
+ SegmentCondition,
19
+ SegmentContext,
20
+ SegmentRule,
21
+ )
22
+ from flag_engine.segments.evaluator import is_context_in_segment
23
+ from flag_engine.segments.types import ConditionOperator
24
+
25
+ from flagsmith_sql_flag_engine.dialect import Dialect
26
+ from flagsmith_sql_flag_engine.utils import (
27
+ escape_string,
28
+ modulo_literal,
29
+ numeric_literal,
30
+ string_literal,
31
+ )
32
+
33
# Segment condition operators this package exposes as translatable to SQL.
TRANSLATABLE_OPERATORS: frozenset[ConditionOperator] = frozenset(
    (
        # equality / membership
        "EQUAL",
        "NOT_EQUAL",
        "IN",
        # presence
        "IS_SET",
        "IS_NOT_SET",
        # substring
        "CONTAINS",
        "NOT_CONTAINS",
        # ordering
        "GREATER_THAN",
        "LESS_THAN",
        "GREATER_THAN_INCLUSIVE",
        "LESS_THAN_INCLUSIVE",
        # arithmetic / hashing / pattern
        "MODULO",
        "PERCENTAGE_SPLIT",
        "REGEX",
    )
)
51
+
52
+
53
# Constants for chunked MD5-mod-9999 hash. The engine computes
# `int(md5_hex, 16) % 9999`; we split the 32-hex digest into four 8-hex
# chunks, parse each as a 32-bit int, and combine via modular arithmetic.
# Constants are (16^24, 16^16, 16^8) mod 9999, precomputed.
#
# With chunks c1..c4 (most significant first) the digest equals
# c1 * 16^24 + c2 * 16^16 + c3 * 16^8 + c4, so replacing each power of 16
# by its residue mod 9999 leaves the final `% 9999` result unchanged while
# keeping the products far below 64-bit range.
_HASH_CONST_HIGH = 7291  # 16^24 mod 9999 — weight of the first (most significant) chunk
_HASH_CONST_MID = 1897  # 16^16 mod 9999 — weight of the second chunk
_HASH_CONST_LOW = 6835  # 16^8 mod 9999 — weight of the third chunk; the fourth has weight 1
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Context: shape information the translator needs to produce correct refs.
64
+ # ---------------------------------------------------------------------------
65
+
66
+
67
+ class TranslateContext:
68
+ """Inputs the translator needs to produce a query for a specific shape.
69
+
70
+ `evaluation_context` is a flag_engine `EvaluationContext`. Its
71
+ `identity` field is ignored since identity values come from each
72
+ `IDENTITIES` row at SQL execution time. `dialect` is an
73
+ implementation of the `Dialect` protocol; it owns the IDENTITIES
74
+ schema, so column references come from dialect methods rather than
75
+ being configured here. `identities_alias` is the table alias for
76
+ `IDENTITIES` in the surrounding query — defaults to `i`.
77
+ `segment_key` salts `PERCENTAGE_SPLIT` and is auto-injected from
78
+ the segment's `key` field by `translate_segment`.
79
+ """
80
+
81
+ def __init__(
82
+ self,
83
+ evaluation_context: EvaluationContext,
84
+ dialect: Dialect,
85
+ identities_alias: str = "i",
86
+ segment_key: str | None = None,
87
+ ) -> None:
88
+ self.evaluation_context = evaluation_context
89
+ self.dialect = dialect
90
+ self.identities_alias = identities_alias
91
+ self.segment_key = segment_key
92
+
93
+ @property
94
+ def identity_key_expr(self) -> str:
95
+ return self.dialect.identity_key_expr(self.identities_alias)
96
+
97
+ def trait_path(self, trait_key: str) -> str:
98
+ """Dialect-specific path-extraction for a trait value."""
99
+ return self.dialect.trait_path(self.identities_alias, trait_key)
100
+
101
+ def jsonpath_expr(self, prop: Literal["$.identity.identifier", "$.identity.key"]) -> str:
102
+ # Only the row-bound identity columns need an SQL expression — every
103
+ # other JSONPath property is resolved against the eval context up in
104
+ # `translate_condition` via `_engine_static_verdict`.
105
+ match prop:
106
+ case "$.identity.identifier":
107
+ return self.dialect.identifier_expr(self.identities_alias)
108
+ case "$.identity.key":
109
+ return self.dialect.identity_key_expr(self.identities_alias)
110
+
111
+ def with_segment_key(self, key: str) -> "TranslateContext":
112
+ return TranslateContext(
113
+ evaluation_context=self.evaluation_context,
114
+ dialect=self.dialect,
115
+ identities_alias=self.identities_alias,
116
+ segment_key=key,
117
+ )
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Inline SQL builders for hash-based and version-based predicates.
122
+ # ---------------------------------------------------------------------------
123
+
124
+
125
def _percentage_split_expr(
    ctx: TranslateContext, seg_key: str, ctx_value_sql: str, threshold: float
) -> str:
    """Build the boolean SQL for `hash(seg_key + "," + value) <= threshold`.

    Reproduces `flag_engine.utils.hashing.get_hashed_percentage_for_object_ids`
    by reading the digest as four 8-hex-char chunks and folding them
    together modulo 9999. The one known divergence from the engine is the
    ~1/9999 of inputs whose bare hash mod 9999 equals 9998: the engine
    recurses with doubled input there, this expression does not.
    """
    dialect = ctx.dialect
    digest = dialect.md5_hex(f"{string_literal(seg_key)} || ',' || ({ctx_value_sql})")
    # Chunks are taken most-significant first, at hex offsets 1, 9, 17, 25.
    chunks = [dialect.parse_hex_chunk(digest, start) for start in (1, 9, 17, 25)]
    weights = (_HASH_CONST_HIGH, _HASH_CONST_MID, _HASH_CONST_LOW)
    # `zip` stops after the three weighted chunks; the last one is added as is.
    terms = [f"{chunk} * {weight}" for chunk, weight in zip(chunks, weights)]
    terms.append(chunks[-1])
    bucket = dialect.mod(" + ".join(terms), "9999")
    return f"({bucket} / 9998.0 * 100.0 <= {float(threshold)})"
147
+
148
+
149
def _semver_sort_key_expr(ctx: TranslateContext, value_sql: str) -> str:
    """Build a zero-padded `major.minor.patch` key that sorts as a string.

    Comparing two such keys lexically reproduces the engine's
    GT/GTE/LT/LTE/EQ/NE outcome for the major.minor.patch part of a
    version. Any prerelease suffix is ignored.
    """
    dialect = ctx.dialect

    def padded_component(n: int) -> str:
        # A missing n-th digit run counts as '0'; every component is
        # padded to 10 characters so string order equals numeric order.
        digits = dialect.regexp_nth_digit_run(value_sql, n)
        return dialect.lpad(dialect.coalesce(digits, "'0'"), 10, "'0'")

    major, minor, patch = [padded_component(n) for n in (1, 2, 3)]
    return f"({major} || '.' || {minor} || '.' || {patch})"
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Trait-bound and direct comparisons. Both go against the IDENTITIES alias
163
+ # (`i` by default): trait conditions read the trait through the dialect's
164
+ # `trait_path`, JSONPath conditions read the appropriate identity column or env literal.
165
+ # ---------------------------------------------------------------------------
166
+
167
+
168
+ JsonpathKind = Literal[
169
+ "identifier",
170
+ "key",
171
+ "trait",
172
+ "identity_object",
173
+ "untranslatable",
174
+ "static",
175
+ ]
176
+
177
+
178
+ class JsonpathClassification(NamedTuple):
179
+ """What a JSONPath property resolves to in the SQL setting.
180
+
181
+ `kind` selects the shape; `trait_key` carries the trait name only when
182
+ `kind == "trait"`.
183
+ """
184
+
185
+ kind: JsonpathKind
186
+ trait_key: str | None = None
187
+
188
+
189
def _classify_jsonpath(prop: str) -> JsonpathClassification:
    """Classify a JSONPath property by what it resolves to in the SQL setting.

    Identity is per-row in our query model — each `IDENTITIES` row IS an
    identity — but the engine treats `$.identity.*` as a lookup against
    the eval-context identity. Most identity-bound paths therefore need
    to map to a row reference, not be statically pre-computed against
    the eval context.

    A `prop` that doesn't parse as JSONPath classifies as a trait keyed
    by the prop string itself — the engine treats unparseable `$.`-
    prefixed properties as literal trait keys, and we mirror that.

    Returns one of:

    - `("trait", key)` for `$.identity.traits.<key>` or an unparseable prop;
    - `("identifier",)` / `("key",)` for `$.identity.identifier` / `$.identity.key`;
    - `("identity_object",)` for a bare `$.identity`;
    - `("untranslatable",)` for any other path rooted at `identity`;
    - `("static",)` for every path not rooted at `identity`.
    """
    try:
        compiled = jsonpath_rfc9535.compile(prop)
    except jsonpath_rfc9535.JSONPathSyntaxError:
        return JsonpathClassification("trait", prop)
    # Collect the leading run of plain name selectors; stop at the first
    # segment that is anything else.
    names: list[str] = []
    for s in compiled.segments:
        if len(s.selectors) != 1:  # pragma: no cover - multi-selector segments not in dataset
            break
        # Selectors without a `name` attribute are treated as non-name selectors.
        name = getattr(s.selectors[0], "name", None)
        if name is None:
            break
        names.append(name)
    else:
        # Loop finished without `break`: the whole path is plain names.
        if names and names[0] == "identity":
            if len(names) == 1:
                # `$.identity` — the whole identity object. Every row in
                # the IDENTITIES table IS an identity by construction,
                # so we don't go through the eval context — which may or
                # may not carry an identity, depending on caller. The
                # translator encodes the row-truth directly: IS_SET →
                # TRUE, IS_NOT_SET → FALSE, scalar comparators → FALSE,
                # mirroring the engine's fail-cast on a dict.
                return JsonpathClassification("identity_object")
            if len(names) == 2 and names[1] == "identifier":
                return JsonpathClassification("identifier")
            if len(names) == 2 and names[1] == "key":
                return JsonpathClassification("key")
            if len(names) == 3 and names[1] == "traits":
                return JsonpathClassification("trait", names[2])
            # Any other all-names path under `identity` has no row mapping.
            return JsonpathClassification("untranslatable")
    # Reached after a `break`, or for an all-names path not rooted at `identity`.
    if names and names[0] == "identity":
        # Identity path with non-name selectors — wildcards, filters,
        # etc. — we can't map those to fixed row references.
        return JsonpathClassification("untranslatable")
    return JsonpathClassification("static")
237
+
238
+
239
def _engine_static_verdict(ctx: TranslateContext, cond: SegmentCondition) -> str:
    """Evaluate one condition with `is_context_in_segment` against the
    eval context and return the SQL literal `'TRUE'` or `'FALSE'`.

    Meant for JSONPath conditions that read no row-bound state: their
    outcome is identical for every row of the final query, so it is
    folded into a constant at translation time.
    """
    # Wrap the lone condition in a throwaway single-rule segment, since
    # the engine's entry point evaluates whole segments only.
    probe: SegmentContext = {
        "key": ctx.segment_key or "_static",
        "name": "_static",
        "rules": [{"type": "ALL", "conditions": [cond]}],
    }
    if is_context_in_segment(ctx.evaluation_context, probe):
        return "TRUE"
    return "FALSE"
251
+
252
+
253
+ def _engine_in_values(value: object) -> list[str] | None:
254
+ """Mirror `flag_engine.segments.evaluator._get_in_values`: parse a segment
255
+ value into a list of candidate strings. Returns None for inputs the
256
+ engine doesn't accept — anything that's neither a string nor a list."""
257
+ if isinstance(value, list):
258
+ return [v if isinstance(v, str) else str(v) for v in value]
259
+ if not isinstance(value, str):
260
+ return None
261
+ if value.startswith("["):
262
+ try:
263
+ parsed = json.loads(value)
264
+ except (ValueError, TypeError):
265
+ return value.split(",")
266
+ if isinstance(parsed, list): # pragma: no branch - `[`-prefixed valid JSON parses as a list
267
+ return [v if isinstance(v, str) else str(v) for v in parsed]
268
+ return value.split(",")
269
+
270
+
271
def _comparison(
    ctx: TranslateContext,
    op: str,
    expr: str,
    value: object,
    is_jsonpath: bool = False,
) -> str | None:
    """Emit a SQL fragment comparing `expr` against `value` per `op`.

    Used for both trait references — cast via the dialect as needed —
    and JSONPath references, which arrive as already-typed columns or
    string literals.

    Args:
        ctx: Translation context; supplies the active dialect.
        op: Operator name; must be one of the operators handled below.
        expr: SQL expression for the left-hand side.
        value: Raw segment operand.
        is_jsonpath: When true, `expr` is used as-is for string
            comparisons instead of being wrapped in the dialect's
            string cast.

    Returns:
        The SQL predicate; `"FALSE"` for inputs the engine evaluates to a
        deterministic False — missing value, non-numeric operand on a
        comparator, malformed MODULO or IN operand; or `None` only for
        genuinely untranslatable inputs such as a REGEX pattern the
        active dialect's regex flavour can't compile.

    Raises:
        AssertionError: If `op` is not handled here.
    """
    if value is None:
        return "FALSE"
    d = ctx.dialect
    lit = string_literal(str(value))
    str_expr = expr if is_jsonpath else d.cast_string(expr)
    if op == "EQUAL":
        return f"{str_expr} = {lit}"
    if op == "NOT_EQUAL":
        return f"{str_expr} <> {lit}"
    if op == "IN":
        # Parse the operand exactly like the trait IN path does, so list
        # operands and JSON-array strings are handled the same way on both
        # paths. Members are compared verbatim (no whitespace stripping),
        # matching `_engine_in_values`.
        items = _engine_in_values(value)
        if not items:
            # `None`: operand is neither a string nor a list. Empty list:
            # nothing can match, and `IN ()` is not valid SQL.
            return "FALSE"
        in_list = ",".join(string_literal(item) for item in items)
        return f"{str_expr} IN ({in_list})"
    if op == "CONTAINS":
        return d.position(lit, str_expr)
    if op == "NOT_CONTAINS":
        return f"({expr} IS NOT NULL AND NOT ({d.position(lit, str_expr)}))"
    if op in {"GREATER_THAN", "LESS_THAN", "GREATER_THAN_INCLUSIVE", "LESS_THAN_INCLUSIVE"}:
        numeric_lit = numeric_literal(value)
        if numeric_lit is None:
            # Engine: float() on a non-numeric operand raises → returns False.
            return "FALSE"
        sql_op = {
            "GREATER_THAN": ">",
            "LESS_THAN": "<",
            "GREATER_THAN_INCLUSIVE": ">=",
            "LESS_THAN_INCLUSIVE": "<=",
        }[op]
        return f"({expr} IS NOT NULL AND {d.cast_float(expr)} {sql_op} {numeric_lit})"
    if op == "MODULO":
        parsed = modulo_literal(value)
        if parsed is None:
            # Bad operand — empty string, missing separator, non-numeric
            # side. Engine catches the cast error and returns False.
            return "FALSE"
        divisor_lit, remainder_lit = parsed
        mod_expr = d.mod(d.cast_number(expr), divisor_lit)
        return f"({expr} IS NOT NULL AND ({mod_expr}) = {remainder_lit})"
    if op == "REGEX":
        pattern = str(value)
        if not d.regex_supports(pattern):
            return None
        return f"({expr} IS NOT NULL AND {d.regexp_anchored_match(str_expr, pattern)})"
    raise AssertionError(  # pragma: no cover - all TRANSLATABLE_OPERATORS handled above
        f"unhandled translatable operator in _comparison: {op}"
    )
334
+
335
+
336
+ # ---------------------------------------------------------------------------
337
+ # Condition translation: routes the operator to the right SQL shape.
338
+ # ---------------------------------------------------------------------------
339
+
340
+
341
# Comparators that get semver treatment when the segment value ends with
# `:semver`, mapped to the SQL operator placed between the two semver sort
# keys.
_SEMVER_OPS = {
    "EQUAL": "=",
    "NOT_EQUAL": "<>",
    "GREATER_THAN": ">",
    "LESS_THAN": "<",
    "GREATER_THAN_INCLUSIVE": ">=",
    "LESS_THAN_INCLUSIVE": "<=",
}
349
+
350
+
351
def _translate_trait_op(
    ctx: TranslateContext,
    trait_key: str,
    op: ConditionOperator,
    val: object,
) -> str | None:
    """Compile `op` applied to the trait named `trait_key` into SQL.

    Dispatch order: presence checks, `:semver`-marked comparators,
    dialect-owned IN / equality predicates, then the generic
    `_comparison` fallback. Returns `None` when the fallback can't
    compile the input, such as a REGEX pattern the active dialect
    rejects.
    """
    column = ctx.trait_path(trait_key)

    # Presence checks need nothing beyond the extracted path.
    if op == "IS_SET":
        return f"{column} IS NOT NULL"
    if op == "IS_NOT_SET":
        return f"{column} IS NULL"

    # Semver path: only for the comparators in `_SEMVER_OPS` and only when
    # the operand carries the `:semver` suffix. Every other operator treats
    # the suffix as ordinary string content and falls through.
    semver_sql_op = _SEMVER_OPS.get(op)
    if semver_sql_op is not None and isinstance(val, str) and val.endswith(":semver"):
        version_lit = string_literal(val.removesuffix(":semver"))
        lhs_key = _semver_sort_key_expr(ctx, ctx.dialect.cast_string(column))
        rhs_key = _semver_sort_key_expr(ctx, version_lit)
        return f"({column} IS NOT NULL AND {lhs_key} {semver_sql_op} {rhs_key})"

    # Type-aware predicates are delegated to the dialect: discriminator
    # functions, runtime coercion casts and short-circuit pitfalls are all
    # SQL-engine specific.
    if op == "IN":
        candidates = _engine_in_values(val)
        if candidates is None:
            # Operand is neither a string nor a list — engine returns False.
            return "FALSE"
        membership = ctx.dialect.trait_in(ctx.identities_alias, trait_key, candidates)
        return f"({column} IS NOT NULL AND {membership})"
    if val is not None and op in ("EQUAL", "NOT_EQUAL"):
        equality = ctx.dialect.trait_eq(
            ctx.identities_alias,
            trait_key,
            val,
            negate=(op == "NOT_EQUAL"),
        )
        return f"({column} IS NOT NULL AND {equality})"

    return _comparison(ctx, op, column, val, is_jsonpath=False)
397
+
398
+
399
def translate_condition(cond: SegmentCondition, ctx: TranslateContext) -> str | None:
    """Translate a single segment condition into a SQL predicate fragment.

    The condition's property is classified first (bare trait key,
    identity-bound JSONPath, or non-identity JSONPath) and the operator is
    then routed to the matching SQL shape.

    Returns:
        A SQL fragment; the constant `"TRUE"` / `"FALSE"` when the outcome
        is decided at translation time; or `None` when the condition can't
        be translated — the operator is not in `TRANSLATABLE_OPERATORS`,
        the property is classified as untranslatable, PERCENTAGE_SPLIT
        targets `$.identity` or a non-identity JSONPath, or a downstream
        helper returns `None`.

    Raises:
        AssertionError: For PERCENTAGE_SPLIT when `ctx.segment_key` is unset.
    """
    op = cond["operator"]
    if op not in TRANSLATABLE_OPERATORS:
        return None

    prop = cond.get("property") or ""
    val = cond.get("value")

    # Classify the property up front. Identity-bound JSONPaths —
    # `$.identity.identifier`, `$.identity.key`, `$.identity.traits.<x>` —
    # map to row references; non-identity JSONPaths are eval-ctx-bound,
    # constant for every row, and get pre-computed via the engine. Bare
    # trait keys bypass the JSONPath compile — they're classified as a
    # literal trait lookup directly.
    classification = (
        _classify_jsonpath(prop) if prop.startswith("$.") else JsonpathClassification("trait", prop)
    )
    if classification.kind == "trait":
        # Trait keys carried via `$.identity.traits.<x>` arrive normalised
        # to the bare key; literal trait keys come through untouched.
        assert classification.trait_key is not None
        prop = classification.trait_key

    # PERCENTAGE_SPLIT — inline pure-SQL hash.
    if op == "PERCENTAGE_SPLIT":
        # `translate_segment` always injects `segment_key` from the segment
        # before recursing; reaching here without one means a caller invoked
        # `translate_condition` directly with a half-formed context.
        assert ctx.segment_key is not None, (
            "PERCENTAGE_SPLIT requires a segment_key as the hash salt"
        )
        threshold_lit = numeric_literal(val)
        if threshold_lit is None:
            # Engine: float() on the threshold raises → returns False.
            return "FALSE"
        threshold = float(threshold_lit)
        identity: dict[str, object] = ctx.evaluation_context.get("identity") or {}  # type: ignore[assignment]
        kind = classification.kind
        if not prop:
            # Implicit `$.identity.key` — engine returns False when no
            # identity, or when the identity lacks `key`. The engine
            # never synthesises one from env+identifier.
            if not identity.get("key"):
                return "FALSE"
            value_expr = ctx.dialect.cast_string(ctx.identity_key_expr)
        elif kind == "key":
            if not identity.get("key"):
                return "FALSE"
            value_expr = ctx.dialect.cast_string(ctx.jsonpath_expr("$.identity.key"))
        elif kind == "identifier":
            if not identity.get("identifier"):
                return "FALSE"
            value_expr = ctx.dialect.cast_string(ctx.jsonpath_expr("$.identity.identifier"))
        elif kind == "identity_object":
            # PERCENTAGE_SPLIT on `$.identity` — the whole dict. Engine
            # hashes `str(dict)`, which is a stable but useless subject;
            # nobody writes this in practice. Treat as untranslatable.
            return None
        elif kind == "untranslatable":
            # `$.identity.<X>` we don't represent in the row schema.
            return None
        elif kind == "static":
            # Non-identity JSONPath: the engine hashes the resolved value.
            # We'd need to bake it as a literal hash subject — leave for
            # future work and let the caller fall back to the engine.
            return None
        else:
            # Plain trait key, or `$.identity.traits.<X>` rewritten to
            # the bare key. Hash subject pulls from `i.traits:"<key>"`
            # per row.
            traits = identity.get("traits") or {}
            if not isinstance(traits, dict) or prop not in traits:
                return "FALSE"
            value_expr = ctx.dialect.cast_string(ctx.trait_path(prop))
        return _percentage_split_expr(ctx, ctx.segment_key, value_expr, threshold)

    if not prop:
        # Non-PERCENTAGE_SPLIT condition without a property — engine looks up
        # nothing, the comparator's cast fails, returns False.
        return "FALSE"

    if classification.kind == "trait":
        return _translate_trait_op(ctx, prop, op, val)

    # Non-trait classifications. We don't replicate the engine's per-row
    # trait-first dispatch — it would roughly double the cost of every
    # wrapped JSONPath condition. A row that happens to carry a trait
    # literally named e.g. `$.identity` would shadow our resolution.
    # Niche shape; the engine-parity suite xfails the one engine-test-
    # data case that hits it.
    if classification.kind in ("identifier", "key"):
        path = ctx.jsonpath_expr(
            "$.identity.identifier" if classification.kind == "identifier" else "$.identity.key"
        )
        if op == "IS_SET":
            return "TRUE"
        if op == "IS_NOT_SET":
            return "FALSE"
        return _comparison(ctx, op, path, val, is_jsonpath=True)
    if classification.kind == "identity_object":
        # `$.identity` — engine treats non-primitive lookups as "not
        # set" by design; no operator meaningfully takes an object. So
        # IS_SET → FALSE, IS_NOT_SET → TRUE, every scalar comparator
        # fail-casts on the dict → FALSE. The SQL answer is the same
        # for every row regardless of whether the eval context carries
        # an identity, so we encode it directly.
        return "TRUE" if op == "IS_NOT_SET" else "FALSE"
    if classification.kind == "untranslatable":
        # Identity-bound JSONPath we can't map to row state — caller falls
        # back to the engine.
        return None
    # static
    return _engine_static_verdict(ctx, cond)
512
+
513
+
514
+ # ---------------------------------------------------------------------------
515
+ # Rule and segment translation: Boolean composition over conditions.
516
+ # ---------------------------------------------------------------------------
517
+
518
+
519
def translate_rule(rule: SegmentRule, ctx: TranslateContext) -> str | None:
    """Compose one rule's conditions and nested rules into a Boolean SQL expression.

    Each child is translated and parenthesised, then the children are
    joined according to the rule type: `ALL` → `AND`, `ANY` → `OR`,
    `NONE` → `NOT (... OR ...)`. Returns `None` as soon as any child is
    untranslatable, and also for a rule type other than those three.

    Raises:
        AssertionError: If the rule has neither conditions nor nested rules.
    """
    fragments: list[str] = []

    for condition in rule.get("conditions") or []:
        translated = translate_condition(condition, ctx)
        if translated is None:
            return None
        fragments.append(f"({translated})")

    for sub_rule in rule.get("rules") or []:
        translated = translate_rule(sub_rule, ctx)
        if translated is None:
            return None
        fragments.append(f"({translated})")

    assert fragments, "segment rule must have at least one condition or nested rule"

    rule_type = rule["type"]
    if rule_type == "ALL":
        return " AND ".join(fragments)
    if rule_type == "ANY":
        return " OR ".join(fragments)
    if rule_type == "NONE":
        return f"NOT ({' OR '.join(fragments)})"
    return None
540
+
541
+
542
def translate_segment(segment: SegmentContext, ctx: TranslateContext) -> str | None:
    """Return a SQL `WHERE` expression for the segment.

    Output shape::

        SELECT ... FROM IDENTITIES i
        WHERE i.environment_id = '<env-key>'
          AND <returned expression>

    Only the predicate is produced; composing the surrounding query is
    the caller's job. The segment's key is bound onto the context before
    any rule is translated. A segment with no rules yields `"FALSE"`;
    otherwise the top-level rules are parenthesised and AND-ed together.

    Returns `None` as soon as any rule can't be translated — for example
    a REGEX pattern the active dialect's regex flavour can't compile.
    Callers should fall back to `flag_engine.is_context_in_segment` for
    those segments.
    """
    scoped_ctx = ctx.with_segment_key(segment["key"])
    top_level_rules = segment.get("rules") or []
    if not top_level_rules:
        return "FALSE"

    predicates: list[str] = []
    for rule in top_level_rules:
        fragment = translate_rule(rule, scoped_ctx)
        if fragment is None:
            return None
        predicates.append(f"({fragment})")
    return " AND ".join(predicates)
@@ -0,0 +1,87 @@
1
+ """SQL escape, validation, and regex-flavour primitives, shared by
2
+ the translator and dialects.
3
+
4
+ The translator emits SQL by string composition rather than via a query-
5
+ builder. Every value originating in a `SegmentCondition` or evaluation
6
+ context must be escaped or validated before it lands in a SQL fragment;
7
+ this module is the single home for that logic.
8
+
9
+ If you find yourself f-string-interpolating a segment- or context-derived
10
+ value, route it through one of these helpers. Bypassing this layer is how
11
+ SQL injection happens; the audit trail is the call sites here.
12
+
13
+ Threat model: segment definitions come from Flagsmith users with
14
+ `MANAGE_SEGMENTS` permission on a project — trusted-but-not-fully-trusted.
15
+ A malicious operand value must not be able to escalate to arbitrary SQL
16
+ execution against the analytical store.
17
+
18
+ Functions in this module are dialect-agnostic. Anything that depends on
19
+ SQL-engine syntax — VARIANT path quoting, JSONB extraction, casts — lives
20
+ on the `Dialect` protocol instead.
21
+ """
22
+
23
+ import re
24
+
25
+
26
def escape_string(value: str, *, escape_backslash: bool = True) -> str:
    """Escape `value` for inclusion inside a single-quoted SQL string literal.

    Single quotes are doubled. Backslashes are doubled as well by default:
    SQL engines that treat backslash as an escape character inside string
    literals (ClickHouse among them) would otherwise read a backslash
    followed by the doubled quote as an escaped quote plus the closing
    quote of the literal, letting the rest of the operand run as SQL.

    Use when the caller is composing a larger literal — for example a
    CSV-style `IN ('a','b','c')` — and wants the un-wrapped escape. For
    a single standalone value, prefer `string_literal`.

    Args:
        value: Raw text to escape.
        escape_backslash: Pass `False` only for engines whose string
            literals treat backslash as an ordinary character, or when
            the text has already had its backslashes escaped.

    Returns:
        The escaped text, without surrounding quotes.
    """
    if escape_backslash:
        # Backslashes first, so the quotes doubled below are not preceded
        # by an unpaired backslash.
        value = value.replace("\\", "\\\\")
    return value.replace("'", "''")


def string_literal(value: str, *, escape_backslash: bool = True) -> str:
    """Wrap a value as a single-quoted SQL string literal.

    `escape_backslash` is forwarded to `escape_string`.
    """
    return "'" + escape_string(value, escape_backslash=escape_backslash) + "'"
39
+
40
+
41
+ def numeric_literal(value: object) -> str | None:
42
+ """Validate `value` is numeric and return its canonical-float string form.
43
+
44
+ Returns `None` if `value` is not parseable as a float — the caller
45
+ propagates that as "untranslatable" rather than injecting unparseable
46
+ SQL.
47
+
48
+ Booleans are rejected explicitly: `float(True) == 1.0` in Python,
49
+ but the engine treats segment-value booleans as strings via its
50
+ type-coercion path, so a numeric interpretation here would diverge.
51
+ """
52
+ if isinstance(value, bool):
53
+ return None
54
+ try:
55
+ return str(float(value)) # type: ignore[arg-type]
56
+ except (TypeError, ValueError):
57
+ return None
58
+
59
+
60
# Conservative check for Python-re features RE2 doesn't support. False
# positives are acceptable (the pattern is merely reported as unsupported);
# false negatives are not, since they surface as SQL errors at query time.
_RE2_UNSAFE = re.compile(
    r"\\\d"  # backreference like \1 .. \9
    r"|\(\?[=!<>(]"  # lookahead / lookbehind, atomic group `(?>`, conditional `(?(`
    r"|\(\?P="  # named backreference `(?P=name)`
)


def re2_safe(pattern: str) -> bool:
    """Return True if `pattern` uses only features RE2 supports.

    RE2 excludes backreferences (numbered and named), lookarounds, atomic
    groups and conditional groups; any of those makes the pattern unsafe.
    The check is purely textual and deliberately conservative, so an
    escaped backslash followed by a digit or a `(?<name>...)` group is
    also reported as unsafe.

    Use this as the regex feature-detector in dialects whose SQL engine
    uses RE2 — Snowflake, BigQuery, DuckDB, ClickHouse.
    """
    return _RE2_UNSAFE.search(pattern) is None
75
+
76
+
77
+ def modulo_literal(value: object) -> tuple[str, str] | None:
78
+ """Parse a `divisor|remainder` MODULO operand pair.
79
+
80
+ Returns `(divisor, remainder)` as canonical-float string forms, or
81
+ `None` if either side fails to parse.
82
+ """
83
+ try:
84
+ divisor_str, remainder_str = str(value).split("|")
85
+ return str(float(divisor_str)), str(float(remainder_str))
86
+ except (ValueError, AttributeError):
87
+ return None
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.4
2
+ Name: flagsmith-sql-flag-engine
3
+ Version: 0.1.0
4
+ Summary: SQL translator for Flagsmith segment predicates.
5
+ Author: Flagsmith
6
+ Author-email: Flagsmith <engineering@flagsmith.com>
7
+ License-Expression: BSD-3-Clause
8
+ Classifier: Programming Language :: Python :: 3 :: Only
9
+ Classifier: Programming Language :: SQL
10
+ Classifier: Topic :: Database
11
+ Requires-Dist: flagsmith-flag-engine>=10
12
+ Requires-Dist: jsonpath-rfc9535>=0.2
13
+ Requires-Python: >=3.10
14
+ Project-URL: Homepage, https://github.com/Flagsmith/flagsmith-sql-flag-engine
15
+ Description-Content-Type: text/markdown
16
+
17
+ # flagsmith-sql-flag-engine
18
+
19
+ SQL translator for Flagsmith segment predicates.
20
+
21
+ Where the Python and Rust `flag_engine` implementations evaluate
22
+ `is_context_in_segment` against an in-memory `EvaluationContext`, this
23
+ package takes a `SegmentContext` and emits a SQL `WHERE` expression that
24
+ evaluates the segment against an entire `IDENTITIES` table — one row per
25
+ identity, with the identity's full trait map held in a single column
26
+ the translator path-extracts at query time. `PERCENTAGE_SPLIT` and
27
+ `:semver`-marked comparators compile to inline pure-SQL.
28
+
29
+ ## Quickstart
30
+
31
+ ```python
32
+ from flag_engine.context.types import EvaluationContext, SegmentContext
33
+
34
+ from flagsmith_sql_flag_engine import TranslateContext, translate_segment
35
+ from flagsmith_sql_flag_engine.dialects import ClickHouseDialect
36
+
37
+ eval_context: EvaluationContext = {
38
+ "environment": {"key": "n9fbf9...3ngWhb", "name": "Production"},
39
+ }
40
+ ctx = TranslateContext(evaluation_context=eval_context, dialect=ClickHouseDialect())
41
+
42
+ segment: SegmentContext = {
43
+ "key": "growth-cohort",
44
+ "name": "Growth cohort",
45
+ "rules": [
46
+ {
47
+ "type": "ALL",
48
+ "conditions": [
49
+ {"operator": "EQUAL", "property": "plan", "value": "growth"},
50
+ ],
51
+ },
52
+ ],
53
+ }
54
+ where_expr = translate_segment(segment, ctx)
55
+ # where_expr is a SQL string. Drop into:
56
+ # SELECT COUNT(*) FROM IDENTITIES i
57
+ # WHERE i.environment_id = 'n9fbf9...3ngWhb' AND ({where_expr})
58
+ ```
59
+
60
+ `environment_id` in the `IDENTITIES` table is a string column holding
61
+ `EnvironmentContext.key` directly — the same identifier the engine uses,
62
+ no separate integer PK.
63
+
64
+ `translate_segment` returns `None` if the segment uses an operator the
65
+ translator can't handle — typically a REGEX pattern the active dialect's
66
+ regex flavour can't compile. Callers should fall back to
67
+ `flag_engine.is_context_in_segment` for those segments.
68
+
69
+ ## Schema
70
+
71
+ Each dialect publishes the table layout it expects via a `schema_ddl`
72
+ constant. For ClickHouse:
73
+
74
+ ```sql
75
+ CREATE TABLE IF NOT EXISTS IDENTITIES (
76
+ environment_id String,
77
+ id UInt64,
78
+ identifier String,
79
+ identity_key String,
80
+ traits JSON
81
+ )
82
+ ENGINE = MergeTree()
83
+ ORDER BY (environment_id, id);
84
+ ```
85
+
86
+ Traits live in a single `JSON` column (CH 24+, GA in 25.x). Each key is
87
+ stored as a typed subcolumn, so trait reads are direct columnar scans
88
+ rather than per-row JSON parses. Trait keys are *data* — new keys appear
89
+ without schema changes — and the translator only sees the abstract path
90
+ extraction.
91
+
92
+ ClickHouse Cloud requires `SET allow_experimental_json_type = 1` when
93
+ creating a `JSON`-column table (the type is GA on OSS 25.x); the test
94
+ harness applies this setting automatically.
95
+
96
+ Programmatic access:
97
+
98
+ ```python
99
+ from flagsmith_sql_flag_engine.dialects.clickhouse import SCHEMA_DDL
100
+ ```
101
+
102
+ ## Engine parity
103
+
104
+ Validated against [Flagsmith/engine-test-data](https://github.com/Flagsmith/engine-test-data),
105
+ the test suite every engine implementation is checked against. The
106
+ engine-parity suite loads each test case's identity into a per-dialect
107
+ scratch table, translates the case's segments, runs the generated SQL,
108
+ and compares to `flag_engine.is_context_in_segment`.
109
+
110
+ To run the engine-parity suite locally:
111
+
112
+ ```bash
113
+ git submodule update --init # pull engine-test-data
114
+ docker compose up --detach --wait clickhouse
115
+ uv run pytest tests/test_engine.py
116
+ ```
117
+
118
+ Adding a new dialect's parity coverage is one harness module — see
119
+ `tests/harnesses/` for the shape.
120
+
121
+ ## Dialects
122
+
123
+ The translator is dialect-aware: a `Dialect` protocol abstracts the
124
+ SQL fragments that differ across SQL engines — MD5 hex, hex-to-int
125
+ parsing, prefix-anchored regex, padded-version comparison, type-aware
126
+ trait predicates, regex flavour. Today `ClickHouseDialect` is the only
127
+ implementation; adding another engine such as Snowflake, DuckDB or
128
+ Postgres means writing one class.
129
+
130
+ ## Operator coverage
131
+
132
+ | Operator | Translatable | Notes |
133
+ | -------------------------------------------- | :----------: | -------------------------------------------------------------- |
134
+ | `EQUAL`, `NOT_EQUAL`, `IN` | yes | |
135
+ | `IS_SET`, `IS_NOT_SET` | yes | trait subcolumn `IS NOT NULL` / `IS NULL` |
136
+ | `CONTAINS`, `NOT_CONTAINS` | yes | |
137
+ | `GREATER_THAN`, `LESS_THAN` plus `_INCLUSIVE`| yes | |
138
+ | `MODULO` | yes | |
139
+ | `PERCENTAGE_SPLIT` | yes | inlined MD5-mod-9999; ~0.005% diverge on hash==9998 |
140
+ | `REGEX` | partial | dialect-flavour gated; unsupported patterns → caller fallback |
141
+ | `:semver`-marked comparators | yes | major.minor.patch only; ignores prerelease |
142
+
143
+ ## Development
144
+
145
+ ```bash
146
+ make install # uv sync + pre-commit install
147
+ make lint # run pre-commit hooks across the tree
148
+ make typecheck # mypy
149
+ make test # unit tests
150
+ ```
151
+
152
+ Ruff (lint + format) runs as a pre-commit hook on every commit. Mypy
153
+ runs as a `make typecheck` hook on staged Python files.
@@ -0,0 +1,10 @@
1
+ flagsmith_sql_flag_engine/__init__.py,sha256=DgiUBg8KnfdisZiMNsWpzKYtDqJoLvxnKctXQGESdZg,710
2
+ flagsmith_sql_flag_engine/dialect.py,sha256=G4rzXszXUVM8iV-r0w_YJgFAWtqWx1hfJI48P9zuTmE,4514
3
+ flagsmith_sql_flag_engine/dialects/__init__.py,sha256=4pkblw-Jr04CLnSbpMud5Vyj5alBgB6WhSm0Cz9OWSI,141
4
+ flagsmith_sql_flag_engine/dialects/clickhouse.py,sha256=Flwy2QNEv-A53rvZoaWv8Q0wytuO8NY4IaRogGr0Ztg,15523
5
+ flagsmith_sql_flag_engine/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ flagsmith_sql_flag_engine/translator.py,sha256=zRWbq_ZMP1VgpQDEMdlN91u4XaUVXC9zSExkIxEraiM,22952
7
+ flagsmith_sql_flag_engine/utils.py,sha256=ygP8cijp1jQjszFj87odK95jdfV3DwaT02eXrQdikAA,3202
8
+ flagsmith_sql_flag_engine-0.1.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
9
+ flagsmith_sql_flag_engine-0.1.0.dist-info/METADATA,sha256=g8J3yDLDPgyV-cjsaDD3Y3u424M5_HVc17NRYo45Eso,6137
10
+ flagsmith_sql_flag_engine-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.8.24
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any