tellaro-query-language 0.2.3__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- tellaro_query_language-0.2.6.dist-info/LICENSE +72 -0
- tellaro_query_language-0.2.6.dist-info/METADATA +806 -0
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/RECORD +23 -20
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/entry_points.txt +1 -0
- tql/cache/base.py +36 -2
- tql/cache/memory.py +53 -6
- tql/cache/redis.py +52 -11
- tql/cli.py +484 -0
- tql/core.py +244 -5
- tql/evaluator.py +1 -1
- tql/evaluator_components/special_expressions.py +62 -10
- tql/evaluator_components/value_comparison.py +0 -4
- tql/exceptions.py +6 -4
- tql/field_type_inference.py +285 -0
- tql/mutators/geo.py +57 -20
- tql/opensearch_components/query_converter.py +1 -1
- tql/opensearch_stats.py +7 -6
- tql/parser.py +7 -3
- tql/post_processor.py +8 -4
- tql/scripts.py +3 -3
- tql/stats_evaluator.py +357 -5
- tql/streaming_file_processor.py +335 -0
- tellaro_query_language-0.2.3.dist-info/LICENSE +0 -21
- tellaro_query_language-0.2.3.dist-info/METADATA +0 -433
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/WHEEL +0 -0
tql/field_type_inference.py
ADDED
@@ -0,0 +1,285 @@
+"""Field type inference for CSV and other structured data files.
+
+This module provides utilities to automatically detect field types from sample data,
+supporting various data formats including numbers, booleans, dates, and strings.
+"""
+
+import re
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+class FieldTypeInferencer:
+    """Infers field types from sample data records."""
+
+    # Common date/timestamp patterns
+    DATE_PATTERNS = [
+        (r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", "%Y-%m-%dT%H:%M:%S"),  # ISO 8601
+        (r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", "%Y-%m-%d %H:%M:%S"),  # SQL datetime
+        (r"^\d{4}-\d{2}-\d{2}", "%Y-%m-%d"),  # Date only
+        (r"^\d{2}/\d{2}/\d{4}", "%m/%d/%Y"),  # US date
+        (r"^\d{2}-\d{2}-\d{4}", "%d-%m-%Y"),  # EU date
+    ]
+
+    # Boolean value mappings
+    BOOLEAN_VALUES = {
+        "true": True,
+        "false": False,
+        "yes": True,
+        "no": False,
+        "t": True,
+        "f": False,
+        "y": True,
+        "n": False,
+        "1": True,
+        "0": False,
+        "on": True,
+        "off": False,
+    }
+
+    def __init__(self, sample_size: int = 100):
+        """Initialize the inferencer.
+
+        Args:
+            sample_size: Number of records to sample for type inference
+        """
+        self.sample_size = sample_size
+
+    def infer_from_records(
+        self, records: List[Dict[str, Any]], field_overrides: Optional[Dict[str, str]] = None
+    ) -> Dict[str, str]:
+        """Infer field types from a list of records.
+
+        Args:
+            records: List of dictionaries representing records
+            field_overrides: Optional manual field type overrides
+
+        Returns:
+            Dictionary mapping field names to inferred types
+        """
+        if not records:
+            return {}
+
+        # Sample records if we have more than sample_size
+        sample = records[: self.sample_size] if len(records) > self.sample_size else records
+
+        # Collect all field names
+        all_fields: set[str] = set()
+        for record in sample:
+            all_fields.update(record.keys())
+
+        # Infer type for each field
+        field_types = {}
+        for field in all_fields:
+            # Check for manual override first
+            if field_overrides and field in field_overrides:
+                field_types[field] = field_overrides[field]
+            else:
+                field_types[field] = self._infer_field_type(field, sample)
+
+        return field_types
+
+    def _infer_field_type(self, field: str, records: List[Dict[str, Any]]) -> str:  # noqa: C901
+        """Infer the type of a single field from sample records.
+
+        Args:
+            field: Field name
+            records: Sample records
+
+        Returns:
+            Inferred type string ('integer', 'float', 'boolean', 'date', 'string')
+        """
+        # Collect non-null values
+        values = []
+        for record in records:
+            if field in record and record[field] is not None and record[field] != "":
+                values.append(record[field])
+
+        if not values:
+            return "string"  # Default for empty fields
+
+        # If values are already typed (from JSON), use those types
+        if all(isinstance(v, bool) for v in values):
+            return "boolean"
+        if all(isinstance(v, int) for v in values):
+            return "integer"
+        if all(isinstance(v, (int, float)) for v in values):
+            return "float"
+
+        # For string values, try to infer more specific types
+        string_values = [str(v) for v in values]
+
+        # Try boolean detection
+        if self._is_boolean_field(string_values):
+            return "boolean"
+
+        # Try integer detection
+        if self._is_integer_field(string_values):
+            return "integer"
+
+        # Try float detection
+        if self._is_float_field(string_values):
+            return "float"
+
+        # Try date detection
+        if self._is_date_field(string_values):
+            return "date"
+
+        # Default to string
+        return "string"
+
+    def _is_boolean_field(self, values: List[str]) -> bool:
+        """Check if values represent boolean data."""
+        # At least 80% of values should be recognizable boolean values
+        boolean_count = sum(1 for v in values if v.lower() in self.BOOLEAN_VALUES)
+        return boolean_count / len(values) >= 0.8
+
+    def _is_integer_field(self, values: List[str]) -> bool:
+        """Check if values represent integer data."""
+        try:
+            for v in values:
+                int(v)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _is_float_field(self, values: List[str]) -> bool:
+        """Check if values represent floating point data."""
+        try:
+            for v in values:
+                float(v)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _is_date_field(self, values: List[str]) -> bool:
+        """Check if values represent date/timestamp data."""
+        # Try each date pattern
+        match_counts = []
+        for pattern, _ in self.DATE_PATTERNS:
+            matches = sum(1 for v in values if re.match(pattern, v))
+            match_counts.append(matches)
+
+        # If any pattern matches at least 80% of values, consider it a date field
+        best_match_rate = max(match_counts) / len(values) if match_counts else 0
+        return best_match_rate >= 0.8
+
+    def detect_csv_headers(self, first_row: List[str], second_row: List[str]) -> bool:
+        """Detect if the first row of a CSV is a header row.
+
+        Uses heuristics to determine if first row looks like column names.
+
+        Args:
+            first_row: First row values
+            second_row: Second row values
+
+        Returns:
+            True if first row appears to be headers
+        """
+        if not first_row or not second_row:
+            return False
+
+        # Heuristic 1: First row all strings, second row has numbers
+        first_row_all_alpha = all(self._is_mostly_alpha(val) for val in first_row)
+        second_row_has_numbers = any(self._is_numeric(val) for val in second_row)
+
+        if first_row_all_alpha and second_row_has_numbers:
+            return True
+
+        # Heuristic 2: First row has no duplicates (headers should be unique)
+        if len(first_row) != len(set(first_row)):
+            return False
+
+        # Heuristic 3: First row values look like identifiers (snake_case, camelCase, etc.)
+        identifier_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
+        identifier_count = sum(1 for val in first_row if identifier_pattern.match(val))
+        if identifier_count / len(first_row) >= 0.7:
+            return True
+
+        # Heuristic 4: Second row has more varied types than first row
+        first_row_types = self._get_value_types(first_row)
+        second_row_types = self._get_value_types(second_row)
+
+        if len(second_row_types) > len(first_row_types):
+            return True
+
+        # Default to no header if inconclusive
+        return False
+
+    def _is_mostly_alpha(self, value: str) -> bool:
+        """Check if value is mostly alphabetic (for header detection)."""
+        if not value:
+            return False
+        alpha_count = sum(1 for c in value if c.isalpha() or c in "_- ")
+        return alpha_count / len(value) >= 0.5
+
+    def _is_numeric(self, value: str) -> bool:
+        """Check if value is numeric."""
+        try:
+            float(value)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _get_value_types(self, values: List[str]) -> set:
+        """Get set of value types in a list."""
+        types = set()
+        for val in values:
+            if self._is_numeric(val):
+                types.add("numeric")
+            elif val.lower() in self.BOOLEAN_VALUES:
+                types.add("boolean")
+            elif any(re.match(pattern, val) for pattern, _ in self.DATE_PATTERNS):
+                types.add("date")
+            else:
+                types.add("string")
+        return types
+
+    def convert_value(self, value: Any, field_type: str) -> Any:
+        """Convert a value to its inferred type.
+
+        Args:
+            value: Raw value (usually string from CSV)
+            field_type: Target type
+
+        Returns:
+            Converted value
+        """
+        if value is None or value == "":
+            return None
+
+        try:
+            if field_type == "integer":
+                return int(value)
+            elif field_type == "float":
+                return float(value)
+            elif field_type == "boolean":
+                str_val = str(value).lower()
+                return self.BOOLEAN_VALUES.get(str_val, bool(value))
+            elif field_type == "date":
+                # Try to parse as datetime
+                return self._parse_date(str(value))
+            else:
+                return str(value)
+        except (ValueError, TypeError):
+            # If conversion fails, return as string
+            return str(value)
+
+    def _parse_date(self, value: str) -> Optional[str]:
+        """Parse date string and return in ISO format.
+
+        Args:
+            value: Date string
+
+        Returns:
+            ISO formatted date string or original if parsing fails
+        """
+        for pattern, date_format in self.DATE_PATTERNS:
+            if re.match(pattern, value):
+                try:
+                    dt = datetime.strptime(value, date_format)
+                    return dt.isoformat()
+                except ValueError:
+                    continue
+        # Return original if no pattern matches
+        return value
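Note (not part of the diff): a minimal usage sketch of the new module, based on the code above. The sample records and expected results are hypothetical, not taken from the package's tests.

    from tql.field_type_inference import FieldTypeInferencer

    # Hypothetical CSV-style rows where every value arrives as a string.
    records = [
        {"port": "443", "secure": "yes", "seen": "2024-01-15", "host": "web-01"},
        {"port": "8080", "secure": "no", "seen": "2024-01-16", "host": "web-02"},
    ]

    inferencer = FieldTypeInferencer(sample_size=100)
    types = inferencer.infer_from_records(records)
    # -> {"port": "integer", "secure": "boolean", "seen": "date", "host": "string"}

    # Values can then be coerced to their inferred types.
    inferencer.convert_value("2024-01-15", types["seen"])
    # -> "2024-01-15T00:00:00" (ISO formatted by _parse_date)

    # Header detection compares a candidate header row against the first data row.
    inferencer.detect_csv_headers(["host", "port", "secure"], ["web-01", "443", "yes"])
    # -> True: the first row looks like identifiers and the second row contains numbers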
tql/mutators/geo.py
CHANGED
@@ -31,11 +31,22 @@ class GeoIPResolver:
         self.mmdb_readers = self._load_mmdb_files()

     def _load_mmdb_files(self) -> Dict[str, Any]:  # noqa: C901
-        """Load MMDB files with smart detection.
-
-
+        """Load MMDB files with smart detection.
+
+        Supported environment variables (in priority order):
+        - TQL_GEOIP_DB_PATH: Full path to combined database (City, Country, ASN all-in-one)
+        - TQL_GEOIP_DB_CITY_PATH: Path to City database
+        - TQL_GEOIP_DB_COUNTRY_PATH: Path to Country database
+        - TQL_GEOIP_DB_ASN_PATH: Path to ASN database
+        - TQL_GEOIP_MMDB_PATH: Base directory for auto-detection (default: /usr/share/geoip)
+        """
+        # Check for explicit full DB path first (from config or environment variable)
+        db_path = self.config.get("db_path") or os.environ.get("TQL_GEOIP_DB_PATH")
+        if db_path:
+            # Expand user home directory (~)
+            db_path = os.path.expanduser(db_path)
+        if db_path and os.path.exists(db_path):
             # Detect DB type from filename
-            db_path = self.config["db_path"]
             db_lower = db_path.lower()
             if "dbip" in db_lower or "db-ip" in db_lower:
                 self.db_type = "dbip"
@@ -53,6 +64,7 @@ class GeoIPResolver:

         # Check base path for auto-detection
         base_path = self.config.get("base_path", os.environ.get("TQL_GEOIP_MMDB_PATH", "/usr/share/geoip"))
+        base_path = os.path.expanduser(base_path)

         # Priority order for database detection
         db_patterns: List[Dict[str, Any]] = [
@@ -100,11 +112,19 @@ class GeoIPResolver:
                 self.mmdb_type = pattern["mmdb_type"]
                 return {"full": maxminddb.open_database(path)}
             else:
-                # Multiple files needed
+                # Multiple files needed - check config, environment variables, then base_path
                 readers = {}
                 all_found = True
                 for db_type, filename in pattern["files"].items():
-
+                    # Priority: config > environment variable > base_path/filename
+                    env_var_name = f"TQL_GEOIP_DB_{db_type.upper()}_PATH"
+                    path = (
+                        self.config.get(f"{db_type}_db")
+                        or os.environ.get(env_var_name)
+                        or os.path.join(base_path, filename)
+                    )
+                    # Expand user home directory (~)
+                    path = os.path.expanduser(path)
                     if os.path.exists(path):
                         readers[db_type] = maxminddb.open_database(path)
                     else:
@@ -176,6 +196,7 @@ class GeoIPLookupMutator(BaseMutator):
     # Class-level cache and resolver
    _cache_manager: Optional[CacheManager] = None
    _geo_resolver: Optional[GeoIPResolver] = None
+    _geo_resolvers: Dict[str, GeoIPResolver] = {}

     def __init__(self, params: Optional[Dict[str, Any]] = None) -> None:
         super().__init__(params)
@@ -223,10 +244,27 @@ class GeoIPLookupMutator(BaseMutator):

     @classmethod
     def get_geo_resolver(cls, config: Optional[Dict[str, str]] = None) -> GeoIPResolver:
-        """Get or create the GeoIP resolver.
-
-
-
+        """Get or create the GeoIP resolver.
+
+        Note: Resolver is cached at class level per unique configuration.
+        Config with explicit paths will be cached separately from environment variable configs.
+        Environment variable changes require process restart to take effect.
+        """
+        # Create cache key based on config
+        if config is None:
+            cache_key = "env_vars"  # Uses environment variables
+        else:
+            # Use config values as cache key
+            cache_key = str(sorted(config.items()))
+
+        # Check if we have a cached resolver for this config
+        if not hasattr(cls, "_geo_resolvers"):
+            cls._geo_resolvers = {}
+
+        if cache_key not in cls._geo_resolvers:
+            cls._geo_resolvers[cache_key] = GeoIPResolver(config)
+
+        return cls._geo_resolvers[cache_key]

     def _get_field_value(self, record: Dict[str, Any], field_path: str) -> Any:
         """Get a field value from a record, supporting nested fields."""
@@ -274,7 +312,7 @@ class GeoIPLookupMutator(BaseMutator):
         existing_geo_data = None
         existing_as_data = None

-        # Check if geo data already exists in the record
+        # Check if geo data already exists in the record (ECS style)
         if "." in field_name:
             # For nested fields like destination.ip, check destination.geo and destination.as
             parent_path = field_name.rsplit(".", 1)[0]
@@ -283,10 +321,9 @@ class GeoIPLookupMutator(BaseMutator):
                 existing_geo_data = parent.get("geo")
                 existing_as_data = parent.get("as")
         else:
-            # For top-level fields, check
-
-
-            existing_as_data = record["enrichment"].get("as")
+            # For top-level fields like ip, check top-level geo and as fields (ECS style)
+            existing_geo_data = record.get("geo")
+            existing_as_data = record.get("as")

         # If not forcing and geo data exists with at least country_iso_code, return existing
         if (
@@ -318,12 +355,12 @@ class GeoIPLookupMutator(BaseMutator):

         # Get GeoIP resolver and perform lookup
         try:
-
+            # Build config only if we have explicit parameters
+            # Otherwise pass None to let GeoIPResolver read environment variables
+            geo_config = None
             if self.params.get("db_path"):
-                geo_config
-
-                # Use environment variable for direct database path
-                geo_config["db_path"] = os.getenv("TQL_GEOIP_FULL_PATH")
+                geo_config = {"db_path": self.params["db_path"]}
+
             geo_resolver = self.get_geo_resolver(geo_config)

             # Perform lookup
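Note (not part of the diff): a short configuration sketch of the new environment-variable handling. The file paths below are placeholders, and an actual lookup still requires the MMDB files to exist on disk.

    import os

    # Option 1: a single combined database (City, Country, ASN in one MMDB).
    os.environ["TQL_GEOIP_DB_PATH"] = "~/geoip/dbip-full.mmdb"  # placeholder path; "~" is expanded

    # Option 2: separate databases, one variable per type; anything not set falls back to
    # TQL_GEOIP_MMDB_PATH (default /usr/share/geoip) joined with the expected filename.
    os.environ["TQL_GEOIP_DB_CITY_PATH"] = "~/geoip/GeoLite2-City.mmdb"  # placeholder path
    os.environ["TQL_GEOIP_DB_ASN_PATH"] = "~/geoip/GeoLite2-ASN.mmdb"  # placeholder path

    # At lookup time, GeoIPLookupMutator.get_geo_resolver(None) builds one GeoIPResolver from
    # these variables and caches it under the "env_vars" key; passing an explicit
    # {"db_path": ...} config is cached separately, keyed on sorted(config.items()).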
tql/opensearch_components/query_converter.py
CHANGED
@@ -22,7 +22,7 @@ class QueryConverter:
         self.intelligent_mappings = field_mappings
         self.simple_mappings = simple_mappings

-    def convert_node(self, node: Any) -> Dict[str, Any]:
+    def convert_node(self, node: Any) -> Dict[str, Any]:  # noqa: C901
         """Convert a single AST node to OpenSearch query fragment."""
         if isinstance(node, dict):
             node_type = node.get("type")
tql/opensearch_stats.py
CHANGED
@@ -3,8 +3,8 @@
 This module translates TQL stats queries to OpenSearch aggregation DSL.
 """

-from typing import Any, Dict, List, Optional, Union
 import json
+from typing import Any, Dict, List, Optional, Union

 from .exceptions import TQLError

@@ -86,7 +86,9 @@ class OpenSearchStatsTranslator:
             # Build nested terms aggregations for grouping
             aggs_dsl = self._build_grouped_aggregations(aggregations, group_by_fields, field_mappings)
             print(
-                f"\n=== OpenSearch Aggregation Query ===\
+                f"\n=== OpenSearch Aggregation Query ===\n"
+                f"Group by: {group_by_fields}\n"
+                f"Aggregation DSL: {json.dumps(aggs_dsl, indent=2)}\n"
             )
         else:
             # Simple aggregations without grouping
@@ -186,7 +188,6 @@ class OpenSearchStatsTranslator:
         # Check for top/bottom modifiers
         order_field = None
         order_direction = "desc"
-        size = 10

         for agg in aggregations:
             if "modifier" in agg:
@@ -194,7 +195,7 @@ class OpenSearchStatsTranslator:
                 alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
                 order_field = alias
                 order_direction = "desc" if agg["modifier"] == "top" else "asc"
-
+                _size = agg.get("limit", 10)  # noqa: F841
                 break

         # Normalize group_by_fields to handle both old (string) and new (dict) formats
@@ -215,7 +216,7 @@ class OpenSearchStatsTranslator:
         current_aggs = inner_aggs

         # Process group_by fields in reverse order to build proper nesting
-        for
+        for _i, field_spec in enumerate(reversed(normalized_fields)):
             field_name = field_spec["field"]
             bucket_size = field_spec["bucket_size"]

@@ -443,7 +444,7 @@ class OpenSearchStatsTranslator:

         return result

-    def _transform_bucket_recursive(
+    def _transform_bucket_recursive(  # noqa: C901
         self,
         bucket: Dict[str, Any],
         aggregations: List[Dict[str, Any]],
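Note (not part of the diff): the grouped-aggregation DSL printed by the debug statement above is a set of nested terms aggregations. A rough, hand-written example of that general shape follows; the key names, fields, and sizes are illustrative placeholders, not the translator's exact output.

    # Illustrative OpenSearch aggregation body for a stats query grouped by two fields,
    # e.g. a count grouped by source.ip and destination.port (shape only).
    aggs_dsl = {
        "source_ip_terms": {
            "terms": {"field": "source.ip", "size": 10},
            "aggs": {
                "destination_port_terms": {
                    "terms": {"field": "destination.port", "size": 10},
                    "aggs": {"count_0": {"value_count": {"field": "destination.port"}}},
                }
            },
        }
    }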
tql/parser.py
CHANGED
@@ -138,7 +138,7 @@ class TQLParser:
                 "Please simplify your query to reduce nesting.",
                 position=0,
                 query="",
-                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"]
+                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"],
             )

         if isinstance(parsed, list):
@@ -290,7 +290,11 @@ class TQLParser:
                 }
             else:
                 # Fallback to treating as unary logical operator
-                return {
+                return {
+                    "type": "unary_op",
+                    "operator": first.lower(),
+                    "operand": self._build_ast(second, depth + 1),
+                }
         elif len(parsed) >= 3:
             # Check if this is a field with multiple mutators
             if isinstance(parsed[0], str) and all(
@@ -1239,7 +1243,7 @@ class TQLParser:
                 "Please simplify your query to reduce nesting.",
                 position=0,
                 query="",
-                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"]
+                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"],
             )
         if len(parsed_list) < 3:
             # Not enough elements for a chained operation
tql/post_processor.py
CHANGED
@@ -137,7 +137,9 @@ class QueryPostProcessor:
                 # This is a special case: field | any/all/none eq value
                 # Safe access - both keys are guaranteed to exist by the if checks
                 array_operator = requirement.metadata["operator"]  # exists from line 128 check
-                comparison_operator = requirement.metadata[
+                comparison_operator = requirement.metadata[
+                    "comparison_operator"
+                ]  # exists from line 135 check
                 value = requirement.metadata.get("value")

                 # Get the field value with proper nested field handling
@@ -1015,10 +1017,10 @@ class QueryPostProcessor:
        - "user.address.zip" -> "user.address.__zip_mutated__"
        - "status" -> "__status_mutated__"
        """
-        field_parts = field_name.split(
+        field_parts = field_name.split(".")
         if len(field_parts) > 1:
             # For nested fields, only mutate the leaf field name
-            return
+            return ".".join(field_parts[:-1] + [f"__{field_parts[-1]}_mutated__"])
         else:
             # For flat fields, mutate the entire name
             return f"__{field_name}_mutated__"
@@ -1169,7 +1171,9 @@ class PostProcessingStats:
 class PostProcessingError(Exception):
     """Exception raised during post-processing operations."""

-    def __init__(
+    def __init__(  # noqa: B042
+        self, message: str, field_name: Optional[str] = None, mutator_name: Optional[str] = None
+    ):
         """Initialize post-processing error.

         Args:
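Note (not part of the diff): the repaired _get_mutated_field_name now performs the leaf-only renaming its docstring describes. A quick sketch of the same logic in isolation:

    # Only the leaf segment of a dotted path is renamed; flat names are renamed whole.
    field_name = "user.address.zip"
    parts = field_name.split(".")
    if len(parts) > 1:
        mutated = ".".join(parts[:-1] + [f"__{parts[-1]}_mutated__"])
    else:
        mutated = f"__{field_name}_mutated__"
    assert mutated == "user.address.__zip_mutated__"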
tql/scripts.py
CHANGED
@@ -34,7 +34,7 @@ def run_coverage():
     env = os.environ.copy()
     if "INTEGRATION_TEST_ENABLE" not in env:
         env["INTEGRATION_TEST_ENABLE"] = "false"
-
+
     # 1. Run pytest with coverage, using `src` as the source
     subprocess.run(["coverage", "run", "--source=src", "-m", "pytest"], check=True, env=env)  # nosec

@@ -50,7 +50,7 @@ def run_tests():
     env = os.environ.copy()
     if "INTEGRATION_TEST_ENABLE" not in env:
         env["INTEGRATION_TEST_ENABLE"] = "false"
-
+
     subprocess.run(["pytest", "tests"], check=True, env=env)  # nosec


@@ -99,7 +99,7 @@ def run_badge():
     env = os.environ.copy()
     if "INTEGRATION_TEST_ENABLE" not in env:
         env["INTEGRATION_TEST_ENABLE"] = "false"
-
+
     subprocess.run(  # nosec
         [
             "coverage",