PyPI - tellaro-query-language - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

{tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/METADATA +24 -1
{tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/RECORD +27 -27
tql/core.py +225 -54
tql/core_components/opensearch_operations.py +415 -99
tql/core_components/stats_operations.py +11 -1
tql/evaluator.py +39 -2
tql/evaluator_components/special_expressions.py +25 -6
tql/evaluator_components/value_comparison.py +31 -3
tql/mutator_analyzer.py +640 -242
tql/mutators/__init__.py +5 -1
tql/mutators/dns.py +76 -53
tql/mutators/security.py +101 -100
tql/mutators/string.py +74 -0
tql/opensearch_components/field_mapping.py +9 -3
tql/opensearch_components/lucene_converter.py +12 -0
tql/opensearch_components/query_converter.py +134 -25
tql/opensearch_mappings.py +2 -2
tql/opensearch_stats.py +170 -39
tql/parser.py +92 -37
tql/parser_components/ast_builder.py +37 -1
tql/parser_components/field_extractor.py +9 -1
tql/parser_components/grammar.py +32 -8
tql/post_processor.py +489 -31
tql/stats_evaluator.py +170 -12
{tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/LICENSE +0 -0
{tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/WHEEL +0 -0
{tellaro_query_language-0.2.0.dist-info → tellaro_query_language-0.2.2.dist-info}/entry_points.txt +0 -0

tql/mutators/__init__.py CHANGED Viewed

@@ -26,7 +26,7 @@ from .list import (
 )
 from .network import IsGlobalMutator, IsPrivateMutator
 from .security import DefangMutator, RefangMutator
-from .string import LengthMutator, LowercaseMutator, SplitMutator, TrimMutator, UppercaseMutator
+from .string import LengthMutator, LowercaseMutator, ReplaceMutator, SplitMutator, TrimMutator, UppercaseMutator
 # Maintain backward compatibility
 __all__ = [
@@ -39,6 +39,7 @@ __all__ = [
     "TrimMutator",
     "SplitMutator",
     "LengthMutator",
+    "ReplaceMutator",
     # Encoding mutators
     "Base64EncodeMutator",
     "Base64DecodeMutator",
@@ -82,6 +83,7 @@ ALLOWED_MUTATORS: Dict[str, Optional[Dict[str, type]]] = {
     "trim": None,
     "split": {"delimiter": str, "field": str},
     "length": {"field": str},
+    "replace": {"find": str, "replace": str, "field": str},
     # URL and security transform mutators
     "refang": {"field": str},
     "defang": {"field": str},
@@ -166,6 +168,8 @@ def create_mutator(name: str, params: Optional[List[List[Any]]] = None) -> BaseM
         return SplitMutator(params_dict)
     elif key == "length":
         return LengthMutator(params_dict)
+    elif key == "replace":
+        return ReplaceMutator(params_dict)
     elif key == "refang":
         return RefangMutator(params_dict)
     elif key == "defang":

tql/mutators/dns.py CHANGED Viewed

@@ -31,18 +31,32 @@ class NSLookupMutator(BaseMutator):
     - Perform reverse DNS lookups (IP to hostname)
     - Query specific DNS record types
     - Support force lookup to bypass existing data
-    - Return enriched data without modifying the original field value
+    - Return ECS-compliant DNS data without modifying the original field value
+    Field Storage (ECS-compliant):
+    - destination.ip | nslookup → stores at destination.domain
+    - source.ip | nslookup → stores at source.domain
+    - ip | nslookup → stores at domain
+    - Multiple queries store as array of ECS DNS objects
     Parameters:
         servers: List of DNS server IPs to use (optional)
-        append_field: Field name to store results (default: field_name + '_resolved')
+        field: Field name to store results (default: auto-detect from field path)
+        append_field: Legacy parameter name for field (deprecated)
         force: Force new lookup even if data exists (default: False)
         save: Save enrichment to record (default: True)
         types: List of DNS record types to query (default: auto-detect)
-        field: Field name to store results (preferred over append_field)
-    Example:
-        hostname | nslookup(servers=['8.8.8.8']) contains 'google.com'
+    Examples:
+        # Basic usage with ECS-compliant storage
+        destination.ip | nslookup
+        source.ip | nslookup
+        # Custom DNS servers
+        hostname | nslookup(servers=['8.8.8.8'])
+        # Custom storage location
+        ip | nslookup(field='custom.dns_data')
     """
     def __init__(self, params: Optional[Dict[str, Any]] = None) -> None:
@@ -232,17 +246,43 @@ class NSLookupMutator(BaseMutator):
         # Save enrichment if requested
         if save_enrichment:
-            # For single value lookups, unwrap the result
+            # Always store ECS data directly, never use IP addresses as field names
             if len(queries) == 1 and queries[0] in resolved_results:
-                # Store the ECS data directly, not wrapped in IP key
+                # Single query: store the ECS data directly
                 append_to_result(record, append_field, resolved_results[queries[0]])
+            elif len(queries) > 1:
+                # Multiple queries: store as array of ECS results
+                results_array = []
+                for query in queries:
+                    if query in resolved_results:
+                        results_array.append(resolved_results[query])
+                append_to_result(record, append_field, results_array)
             else:
-                # For multiple queries, keep the dictionary structure
-                append_to_result(record, append_field, resolved_results)
-        # For enrichment-only mode, return the resolved data
-        # This allows it to be used in geo-style parenthetical expressions
-        return resolved_results
+                # No results
+                append_to_result(record, append_field, None)
+        # For enrichment mutators, return data for comparison
+        # The full enrichment data is stored via append_to_result above
+        # Return value is used for field comparison (e.g., contains 'dns.google')
+        if len(queries) == 1 and queries[0] in resolved_results:
+            # Single query: return the first answer for comparison
+            dns_data = resolved_results[queries[0]]
+            answers = dns_data.get("answers", [])
+            return answers[0] if answers else value  # Return first answer or original value
+        elif len(queries) > 1:
+            # Multiple queries: return array of first answers
+            first_answers = []
+            for query in queries:
+                if query in resolved_results:
+                    dns_data = resolved_results[query]
+                    answers = dns_data.get("answers", [])
+                    if answers:
+                        first_answers.append(answers[0])
+            return first_answers if first_answers else value
+        else:
+            # No results: return original value
+            return value
     def _format_dns_ecs(  # noqa: C901
         self, query_value: str, records: List[Dict[str, Any]], query_types: List[str]
@@ -257,60 +297,43 @@ class NSLookupMutator(BaseMutator):
         Returns:
             ECS-compliant DNS data structure
         """
-        # Build ECS structure
+        # Extract answers as simple array of values
+        answers = []
+        ttls = []
+        types = []
+        for record in records:
+            data = record.get("data", "")
+            if data:
+                answers.append(data)
+                ttls.append(record.get("ttl", 0))
+                types.append(record.get("type", ""))
+        # Build clean ECS structure
         ecs_data = {
             "question": {"name": query_value, "type": query_types[0] if query_types else "A"},  # Primary query type
-            "answers": records,
+            "answers": answers,  # Simple array of answer values
             "response_code": "NOERROR" if records else "NXDOMAIN",
         }
-        # Extract specific data for convenience fields
-        resolved_ips = []
-        hostnames = []
-        mx_records = []
-        txt_records = []
+        # Add TTLs if we have them (optional field)
+        if ttls:
+            ecs_data["ttl"] = ttls
+        # Add types if they vary (optional field)
+        if types and len(set(types)) > 1:
+            ecs_data["types"] = types
+        # Extract resolved IPs for ECS standard field
+        resolved_ips = []
         for record in records:
             record_type = record.get("type", "")
             data = record.get("data", "")
             if record_type in ["A", "AAAA"] and data:
                 resolved_ips.append(data)
-            elif record_type == "PTR" and data:
-                hostnames.append(data)
-            elif record_type == "CNAME" and data:
-                hostnames.append(data)
-            elif record_type == "MX" and data:
-                mx_records.append(data)
-            elif record_type == "TXT" and data:
-                txt_records.append(data)
         # Add resolved_ip array (ECS standard field)
         if resolved_ips:
             ecs_data["resolved_ip"] = resolved_ips
-        # Add convenience fields for easier access
-        if hostnames:
-            ecs_data["hostname"] = hostnames[0]  # Single hostname for simple access
-            ecs_data["hostnames"] = hostnames  # Array of all hostnames
-        # Add record type specific arrays for convenience
-        if resolved_ips:
-            # Separate IPv4 and IPv6
-            ipv4 = [ip for ip in resolved_ips if ":" not in ip]
-            ipv6 = [ip for ip in resolved_ips if ":" in ip]
-            if ipv4:
-                ecs_data["a"] = ipv4
-            if ipv6:
-                ecs_data["aaaa"] = ipv6
-        if hostnames and any(r.get("type") == "PTR" for r in records):
-            ecs_data["ptr"] = hostnames[0]  # Backward compatibility
-        if mx_records:
-            ecs_data["mx"] = mx_records
-        if txt_records:
-            ecs_data["txt"] = txt_records
         return ecs_data

tql/mutators/security.py CHANGED Viewed

@@ -60,53 +60,42 @@ class RefangMutator(BaseMutator):
     def _refang_string(self, s: str) -> str:
         """Refang a single string."""
+        import re
         result = s
-        # Apply replacements in specific order to handle spaces properly
-        # First handle patterns with spaces
-        result = result.replace(" [.] ", ".")
-        result = result.replace(" [dot] ", ".")
-        result = result.replace(" [at] ", "@")
-        result = result.replace(" [:] ", ":")
-        # Protocol defanging (various cases)
-        result = result.replace("hxxp://", "http://")
-        result = result.replace("hXXp://", "http://")
-        result = result.replace("HxXp://", "http://")
-        result = result.replace("HxxP://", "http://")
-        result = result.replace("HXXP://", "http://")
-        result = result.replace("hxxps://", "https://")
-        result = result.replace("hXXps://", "https://")
-        result = result.replace("HXXPS://", "https://")
-        result = result.replace("fxp://", "ftp://")
-        result = result.replace("fXp://", "ftp://")
-        result = result.replace("FXP://", "ftp://")
-        # Dot defanging
-        result = result.replace("[.]", ".")
-        result = result.replace("(.)", ".")
-        result = result.replace("{.}", ".")
-        result = result.replace("[dot]", ".")
-        result = result.replace("(dot)", ".")
-        result = result.replace("{dot}", ".")
-        # Colon defanging
-        result = result.replace("[:]", ":")
-        result = result.replace("(:)", ":")
-        result = result.replace("{:}", ":")
-        # At symbol defanging
-        result = result.replace("[at]", "@")
-        result = result.replace("(at)", "@")
-        result = result.replace("{at}", "@")
-        result = result.replace("[@]", "@")
-        result = result.replace("(@)", "@")
-        result = result.replace("{@}", "@")
-        # Slash defanging
-        result = result.replace("[/]", "/")
-        result = result.replace("(/)", "/")
-        result = result.replace("{/}", "/")
+        # Apply replacements for common defanging patterns
+        # Handle various protocol defanging patterns (case insensitive)
+        # Important: Check for 'ps' suffix first to avoid false matches
+        result = re.sub(r"h[xX]{1,2}ps://", "https://", result, flags=re.IGNORECASE)
+        result = re.sub(r"h[xX]{1,2}p://", "http://", result, flags=re.IGNORECASE)
+        result = re.sub(r"f[xX]p://", "ftp://", result, flags=re.IGNORECASE)
+        # Handle bracketed replacements with optional spaces
+        result = re.sub(r"\s*\[\.\]\s*", ".", result)
+        result = re.sub(r"\s*\[:\]\s*", ":", result)
+        result = re.sub(r"\s*\[at\]\s*", "@", result, flags=re.IGNORECASE)
+        result = re.sub(r"\s*\[@\]\s*", "@", result)
+        result = re.sub(r"\s*\[/\]\s*", "/", result)
+        # Handle parentheses replacements
+        result = re.sub(r"\s*\(\.\)\s*", ".", result)
+        result = re.sub(r"\s*\(:\)\s*", ":", result)
+        result = re.sub(r"\s*\(at\)\s*", "@", result, flags=re.IGNORECASE)
+        result = re.sub(r"\s*\(@\)\s*", "@", result)
+        result = re.sub(r"\s*\(/\)\s*", "/", result)
+        # Handle braces replacements
+        result = re.sub(r"\s*\{\.\}\s*", ".", result)
+        result = re.sub(r"\s*\{:\}\s*", ":", result)
+        result = re.sub(r"\s*\{at\}\s*", "@", result, flags=re.IGNORECASE)
+        result = re.sub(r"\s*\{@\}\s*", "@", result)
+        result = re.sub(r"\s*\{/\}\s*", "/", result)
+        # Handle word replacements with optional brackets/parentheses/braces
+        result = re.sub(r"\s*\[dot\]\s*", ".", result, flags=re.IGNORECASE)
+        result = re.sub(r"\s*\(dot\)\s*", ".", result, flags=re.IGNORECASE)
+        result = re.sub(r"\s*\{dot\}\s*", ".", result, flags=re.IGNORECASE)
         return result
@@ -161,65 +150,77 @@ class DefangMutator(BaseMutator):
             # Return the defanged value directly
             return defanged_value
-    def _defang_string(self, s: str) -> str:
+    def _defang_string(self, s: str) -> str:  # noqa: C901
         """Defang a single string."""
-        # Apply defanging patterns
+        import re
         result = s
-        # Protocol defanging (do these first to avoid double-defanging)
-        result = result.replace("https://", "hXXps://")
-        result = result.replace("http://", "hXXp://")
-        result = result.replace("ftp://", "fXp://")
-        result = result.replace("HTTPS://", "HXXPS://")
-        result = result.replace("HTTP://", "HXXP://")
-        result = result.replace("FTP://", "FXP://")
-        # Now defang dots, but not in the protocol part we just defanged
-        # Split by whitespace to handle individual tokens
-        tokens = result.split()
-        defanged_tokens = []
-        for token in tokens:
-            # Check if this is a URL (has protocol)
-            has_protocol = any(
-                token.startswith(p)
-                for p in [
-                    "hXXp://",
-                    "hXXps://",
-                    "fXp://",
-                    "HXXP://",
-                    "HXXPS://",
-                    "FXP://",
-                    "hxxp://",
-                    "hxxps://",
-                    "fxp://",  # Already defanged variations
-                ]
-            )
-            if has_protocol and "://" in token:
-                # For URLs, defang only the domain part
-                protocol, rest = token.split("://", 1)
-                # Only defang if not already defanged
-                if "[.]" not in rest and "[at]" not in rest:
-                    # Defang dots in domain/path
-                    rest = rest.replace(".", "[.]")
+        # Check if fully defanged to avoid double-defanging
+        # Only return early if all components are already defanged
+        has_defanged_protocol = "hxxp" in result.lower() or "fxp" in result.lower()
+        has_defanged_dots = "[.]" in result
+        has_defanged_at = "[at]" in result
+        # If it's a URL with protocol, check if dots are defanged
+        if has_defanged_protocol and "://" in result:
+            # Extract the part after protocol
+            _, after_protocol = result.split("://", 1)
+            # If dots in the URL part are already defanged, return as-is
+            if "." not in after_protocol or has_defanged_dots:
+                return result
+        # For non-URLs, if already has defanged components, return
+        elif has_defanged_dots and has_defanged_at:
+            return result
+        # First, replace protocols (case-insensitive) with lowercase hxxp/hxxps/fxp
+        result = re.sub(r"https://", "hxxps://", result, flags=re.IGNORECASE)
+        result = re.sub(r"http://", "hxxp://", result, flags=re.IGNORECASE)
+        result = re.sub(r"ftp://", "fxp://", result, flags=re.IGNORECASE)
+        # Split the string to process URLs, emails, and domains separately
+        # Match URLs first since they're more specific
+        url_pattern = r"((?:hxxps?|fxp|https?|ftp)://[^\s]+)"
+        parts = re.split(url_pattern, result)
+        defanged_parts = []
+        for i, part in enumerate(parts):
+            if i % 2 == 1:  # This is a URL match
+                # For URLs, defang the domain part only
+                if "://" in part:
+                    protocol, rest = part.split("://", 1)
+                    # Defang dots in the domain/path (avoid double-defanging)
+                    if "[.]" not in rest:
+                        rest = rest.replace(".", "[.]")
                     # Defang @ if present (for URLs with auth)
-                    rest = rest.replace("@", "[at]")
-                    # Defang colons in port numbers
-                    # Only defang colon if it's followed by numbers (port)
-                    import re
+                    if "[at]" not in rest:
+                        rest = rest.replace("@", "[at]")
+                    # Defang colons in port numbers (e.g., :8080)
                     rest = re.sub(r":(\d+)", r"[:]\1", rest)
-                defanged_tokens.append(f"{protocol}://{rest}")
-            else:
-                # For non-URL tokens, defang dots and @ symbols
-                # But avoid double-defanging
-                if "[.]" not in token and "[at]" not in token:
-                    defanged = token.replace(".", "[.]")
-                    defanged = defanged.replace("@", "[at]")
-                    defanged_tokens.append(defanged)
+                    defanged_parts.append(f"{protocol}://{rest}")
                 else:
-                    # Already defanged, leave as-is
-                    defanged_tokens.append(token)
+                    defanged_parts.append(part)
+            else:
+                # For non-URL text, handle email addresses and domain patterns
+                # First, handle email addresses
+                email_pattern = r"([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
+                part = re.sub(email_pattern, lambda m: f"{m.group(1)}[at]{m.group(2).replace('.', '[.]')}", part)  # type: ignore[arg-type, str-bytes-safe]
+                # Then handle standalone IP addresses
+                ip_pattern = r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b"
+                part = re.sub(ip_pattern, lambda m: m.group(0).replace(".", "[.]"), part)  # type: ignore[arg-type]
+                # Finally handle standalone domain patterns (but not IPs)
+                domain_pattern = r"\b([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b"
+                def defang_domain(match):
+                    domain = match.group(0)
+                    # Only defang if not already defanged and not an IP address
+                    if "[.]" not in domain and not re.match(r"^\d+\.\d+\.\d+\.\d+$", domain):
+                        return domain.replace(".", "[.]")
+                    return domain
+                part = re.sub(domain_pattern, defang_domain, part)
+                defanged_parts.append(part)
-        return " ".join(defanged_tokens)
+        return "".join(defanged_parts)

tql/mutators/string.py CHANGED Viewed

@@ -163,3 +163,77 @@ class LengthMutator(BaseMutator):
         else:
             # Return the length value directly
             return length_value
+class ReplaceMutator(BaseMutator):
+    """Mutator that replaces all occurrences of a string with another string.
+    Performance Characteristics:
+    - In-memory: FAST - Simple string operation with minimal overhead
+    - OpenSearch: MODERATE - Requires post-processing of all results
+    Parameters:
+    - find: The string to find (required)
+    - replace: The string to replace with (required)
+    - field: Optional field to append result to
+    Examples:
+        # Replace all occurrences
+        field | replace(find='old', replace='new')
+        # Use as a filter
+        field | replace(find='error', replace='warning') contains 'warning'
+        # Append to another field
+        field | replace(find='/', replace='_', field='sanitized_field')
+    """
+    def __init__(self, params: Optional[Dict[str, Any]] = None) -> None:
+        super().__init__(params)
+        self.performance_in_memory = PerformanceClass.FAST
+        self.performance_opensearch = PerformanceClass.MODERATE
+        # Validate required parameters
+        if not params:
+            raise ValueError("Replace mutator requires 'find' and 'replace' parameters")
+        if "find" not in params:
+            raise ValueError("Replace mutator requires 'find' parameter")
+        if "replace" not in params:
+            raise ValueError("Replace mutator requires 'replace' parameter")
+    def apply(self, field_name: str, record: Dict[str, Any], value: Any) -> Any:
+        """Apply the replace transformation."""
+        find_str = str(self.params["find"])
+        replace_str = str(self.params["replace"])
+        append_field = self.params.get("field")
+        # Perform the replace operation
+        result: Any  # Declare result with Any type to handle different types
+        if value is None:
+            # Handle None - return as is
+            result = value
+        elif isinstance(value, str):
+            result = value.replace(find_str, replace_str)
+        elif isinstance(value, (list, tuple)):
+            # Apply replace to each string element in the array
+            result = []
+            for item in value:
+                if isinstance(item, str):
+                    result.append(item.replace(find_str, replace_str))
+                else:
+                    # Keep non-string items as-is
+                    result.append(item)
+        elif isinstance(value, (int, float, bool)):
+            # Convert to string first, then replace, then keep as string
+            result = str(value).replace(find_str, replace_str)
+        else:
+            # For other types, return as-is
+            result = value
+        # If append_field is specified, add to record and return original value
+        if append_field:
+            append_to_result(record, append_field, result)
+            return value
+        else:
+            # Return the replaced result directly
+            return result

tql/opensearch_components/field_mapping.py CHANGED Viewed

@@ -67,7 +67,9 @@ class FieldMapping:
                 self.field_types[self.base_field_name] = "keyword"
             elif base_type == "text":
                 analyzer = mapping_info.get("analyzer", "standard")
-                self.text_fields[analyzer] = self.base_field_name
+                # If analyzer is a dict (custom analyzer), use "custom" as key
+                analyzer_key = "custom" if isinstance(analyzer, dict) else analyzer
+                self.text_fields[analyzer_key] = self.base_field_name
                 self.field_types[self.base_field_name] = "text"
             else:
                 self.field_types[self.base_field_name] = base_type
@@ -85,7 +87,9 @@ class FieldMapping:
                         self.field_types[field_path] = "keyword"
                     elif subfield_type == "text":
                         analyzer = subfield_config.get("analyzer", "standard")
-                        self.text_fields[analyzer] = field_path
+                        # If analyzer is a dict (custom analyzer), use "custom" as key
+                        analyzer_key = "custom" if isinstance(analyzer, dict) else analyzer
+                        self.text_fields[analyzer_key] = field_path
                         self.field_types[field_path] = "text"
                     elif subfield_type:
                         self.field_types[field_path] = subfield_type
@@ -114,7 +118,9 @@ class FieldMapping:
                     if field_type == "keyword":
                         self.keyword_field = field_name
                     elif field_type == "text":
-                        self.text_fields[analyzer] = field_name
+                        # If analyzer is a dict (custom analyzer), use "custom" as key
+                        analyzer_key = "custom" if isinstance(analyzer, dict) else analyzer
+                        self.text_fields[analyzer_key] = field_name
                 else:
                     # Legacy format: "keyword" or "text" or other types
                     field_type = field_config

tql/opensearch_components/lucene_converter.py CHANGED Viewed

@@ -42,6 +42,18 @@ class LuceneConverter:
                 return self._convert_unary_op_to_lucene(node)
             elif node_type == "collection_op":
                 return self._convert_collection_op_to_lucene(node)
+            elif node_type == "query_with_stats":
+                # For query_with_stats, only convert the filter part to Lucene
+                # The stats part is handled by the stats engine
+                filter_node = node.get("filter")
+                if filter_node:
+                    return self._convert_node_to_lucene(filter_node)
+                else:
+                    return "*:*"
+            elif node_type == "stats_expr":
+                # Pure stats queries match all documents in Lucene
+                # The aggregations are handled by the stats engine
+                return "*:*"
         raise TQLValidationError(f"Unknown node type: {node}")

tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

tellaro-query-language 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl