tellaro-query-language 0.2.3__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to its public registry. The information is provided for informational purposes only.
tql/core.py CHANGED
@@ -4,12 +4,19 @@ This module provides the main TQL class that serves as the primary interface
 for parsing and executing TQL queries against different backends.
 """
 
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Generator, List, Optional, Union
 
 from .analyzer import EnhancedFieldMapping
 from .core_components import FileOperations, OpenSearchOperations, StatsOperations, ValidationOperations
 from .evaluator import TQLEvaluator
-from .exceptions import TQLOperatorError, TQLParseError, TQLSyntaxError, TQLTypeError, TQLValidationError
+from .exceptions import (
+    TQLExecutionError,
+    TQLOperatorError,
+    TQLParseError,
+    TQLSyntaxError,
+    TQLTypeError,
+    TQLValidationError,
+)
 from .mutator_analyzer import MutatorAnalysisResult
 from .parser import TQLParser
 from .stats_evaluator import TQLStatsEvaluator
@@ -27,7 +34,7 @@ class TQL:
     >>> results = tql.query(data, query)
     """
 
-    def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):
+    def __init__(self, field_mappings: Optional[Dict[str, Union[str, Dict[str, Any]]]] = None):  # noqa: C901
        """Initialize TQL instance.
 
        Args:
@@ -109,8 +116,9 @@ class TQL:
            else:
                # Find primary field (keys without dots, not starting with underscore)
                primary_fields = [
-                   field_key for field_key in v.keys()
-                   if '.' not in field_key and not field_key.startswith('_')
+                   field_key
+                   for field_key in v.keys()
+                   if "." not in field_key and not field_key.startswith("_")
                ]
 
                if primary_fields:
@@ -1049,6 +1057,237 @@ class TQL:
        """
        return self.stats_ops.analyze_stats_query(query)
 
+    def query_file_streaming(
+        self,
+        file_path: str,
+        query: str,
+        input_format: str = "auto",
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Generator[Dict[str, Any], None, None]:
+        """Execute a TQL query against a file in streaming mode.
+
+        This method processes files line-by-line with minimal memory usage,
+        yielding matching records as they are found.
+
+        Args:
+            file_path: Path to file
+            query: TQL query string (filter query only, not stats)
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Yields:
+            Matching records as dictionaries
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If file processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+
+        # Validate query type (only filter queries supported for streaming)
+        query_type = ast.get("type")
+        if query_type in ["stats_expr", "query_with_stats"]:
+            raise TQLExecutionError("Stats queries not supported in streaming mode. Use query_file_stats() instead.")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Process file and evaluate query on each record
+        for record in processor.process_file(file_path, input_format):
+            if self.evaluator._evaluate_node(ast, record, self._simple_mappings):
+                yield record
+
+    def query_file_stats(
+        self,
+        file_path: str,
+        query: str,
+        input_format: str = "auto",
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Dict[str, Any]:
+        """Execute a TQL stats query against a file in streaming mode.
+
+        This method processes files line-by-line with accumulator-based stats
+        calculations for memory efficiency.
+
+        Args:
+            file_path: Path to file
+            query: TQL query string (can include filters and stats)
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Returns:
+            Dictionary containing aggregation results
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If file processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+        query_type = ast.get("type")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Handle different query types
+        if query_type == "stats_expr":
+            # Pure stats query - process all records
+            record_iter = processor.process_file(file_path, input_format)
+            return self.stats_evaluator.evaluate_stats_streaming(record_iter, ast, self.field_mappings)
+
+        elif query_type == "query_with_stats":
+            # Filter + stats query
+            filter_ast = ast["filter"]
+            stats_ast = ast["stats"]
+
+            # Create filtered iterator
+            def filtered_records():
+                for record in processor.process_file(file_path, input_format):
+                    if self.evaluator._evaluate_node(filter_ast, record, self._simple_mappings):
+                        yield record
+
+            return self.stats_evaluator.evaluate_stats_streaming(filtered_records(), stats_ast, self.field_mappings)
+
+        else:
+            # Regular filter query - shouldn't use stats method
+            raise TQLExecutionError("Use query_file_streaming() for filter queries without stats aggregations.")
+
+    def query_folder(
+        self,
+        folder_path: str,
+        query: str,
+        pattern: str = "*",
+        input_format: str = "auto",
+        recursive: bool = False,
+        parallel: int = 4,
+        csv_delimiter: str = ",",
+        csv_headers: Optional[List[str]] = None,
+        no_header: bool = False,
+        field_types: Optional[Dict[str, str]] = None,
+        sample_size: int = 100,
+    ) -> Dict[str, Any]:
+        """Execute a TQL query against multiple files in a folder.
+
+        This method processes all matching files and aggregates results,
+        supporting both filter queries (with records) and stats queries.
+
+        Args:
+            folder_path: Path to folder
+            query: TQL query string
+            pattern: Glob pattern for file matching
+            input_format: File format ('json', 'jsonl', 'csv', 'auto')
+            recursive: Process subdirectories recursively
+            parallel: Number of parallel workers
+            csv_delimiter: CSV delimiter character
+            csv_headers: Manual CSV header names
+            no_header: Force CSV to be treated as having no header
+            field_types: Manual field type mappings
+            sample_size: Number of records to sample for type inference
+
+        Returns:
+            Dictionary containing results and/or stats aggregated across all files
+
+        Raises:
+            TQLParseError: If query parsing fails
+            TQLExecutionError: If folder processing fails
+        """
+        from .streaming_file_processor import StreamingFileProcessor
+
+        # Parse the query
+        ast = self.parse(query)
+        query_type = ast.get("type")
+
+        # Create streaming processor
+        processor = StreamingFileProcessor(
+            sample_size=sample_size,
+            csv_delimiter=csv_delimiter,
+            field_types=field_types,
+            csv_headers=csv_headers,
+            no_header=no_header,
+        )
+
+        # Process folder based on query type
+        if query_type == "stats_expr":
+            # Pure stats query - aggregate across all files
+
+            def all_records():
+                for _file_path, record in processor.process_folder(
+                    folder_path, pattern, input_format, recursive, parallel
+                ):
+                    yield record
+
+            stats_result = self.stats_evaluator.evaluate_stats_streaming(all_records(), ast, self.field_mappings)
+            return {"stats": stats_result, "files_processed": "multiple"}
+
+        elif query_type == "query_with_stats":
+            # Filter + stats query
+            filter_ast = ast["filter"]
+            stats_ast = ast["stats"]
+
+            def filtered_records():
+                for _file_path, record in processor.process_folder(
+                    folder_path, pattern, input_format, recursive, parallel
+                ):
+                    if self.evaluator._evaluate_node(filter_ast, record, self._simple_mappings):
+                        yield record
+
+            stats_result = self.stats_evaluator.evaluate_stats_streaming(
+                filtered_records(), stats_ast, self.field_mappings
+            )
+            return {"stats": stats_result, "files_processed": "multiple"}
+
+        else:
+            # Regular filter query - collect matching records from all files
+            matched_records = []
+            files_processed = 0
+            files_with_matches = 0
+
+            for file_path, record in processor.process_folder(folder_path, pattern, input_format, recursive, parallel):
+                files_processed += 1
+                if self.evaluator._evaluate_node(ast, record, self._simple_mappings):
+                    matched_records.append({"_source_file": file_path, **record})
+                    files_with_matches += 1
+
+            return {
+                "results": matched_records,
+                "total": len(matched_records),
+                "files_processed": files_processed,
+                "files_with_matches": files_with_matches,
+            }
+
     def _apply_mutators_to_record(self, ast: Dict[str, Any], record: Dict[str, Any]) -> Dict[str, Any]:
        """Apply any mutators in the AST to enrich the record.
 
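
A minimal usage sketch of the three new file-query entry points added above. This is not part of the diff: the `from tql import TQL` import path, the file paths, and the query strings are assumptions/placeholders, since the TQL query syntax itself is not shown here.

from tql import TQL  # assumed import path for the package's main class

tql = TQL()
filter_query = "<a TQL filter expression>"  # placeholder, not real syntax
stats_query = "<a TQL stats expression>"    # placeholder, not real syntax

# query_file_streaming(): constant-memory generator of matching records.
for record in tql.query_file_streaming("events.jsonl", filter_query, input_format="jsonl"):
    print(record)

# query_file_stats(): streaming aggregation; returns a single dict of results.
aggregations = tql.query_file_stats("events.jsonl", stats_query)

# query_folder(): runs the query across every matching file and reports counters
# using the result keys shown in the diff above.
result = tql.query_folder("logs/", filter_query, pattern="*.jsonl", recursive=True, parallel=4)
print(result["total"], result["files_processed"], result["files_with_matches"])
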
tql/evaluator.py CHANGED
@@ -67,7 +67,7 @@ class TQLEvaluator:
        field_mappings = field_mappings or {}
        return self._evaluate_node(ast, record, field_mappings)
 
-    def _evaluate_node(self, node: Any, record: Dict[str, Any], field_mappings: Dict[str, str]) -> bool:
+    def _evaluate_node(self, node: Any, record: Dict[str, Any], field_mappings: Dict[str, str]) -> bool:  # noqa: C901
        """Evaluate a single AST node against a record.
 
        Args:
@@ -15,15 +15,27 @@ class SpecialExpressionEvaluator:
    # Sentinel value to distinguish missing fields from None values
    _MISSING_FIELD = object()
 
-    def __init__(self, get_field_value_func, evaluate_node_func):
+    def __init__(self, get_field_value_func, evaluate_node_func, set_field_value_func=None):
        """Initialize the special expression evaluator.
 
        Args:
            get_field_value_func: Function to get field values from records
            evaluate_node_func: Function to evaluate AST nodes
+           set_field_value_func: Optional function to set field values in records
        """
        self._get_field_value = get_field_value_func
        self._evaluate_node = evaluate_node_func
+       self._set_field_value = set_field_value_func or self._default_set_field_value
+
+   def _default_set_field_value(self, record: Dict[str, Any], field_path: str, value: Any) -> None:
+       """Default implementation of set_field_value for nested field assignment."""
+       parts = field_path.split(".")
+       current = record
+       for part in parts[:-1]:
+           if part not in current:
+               current[part] = {}
+           current = current[part]
+       current[parts[-1]] = value
 
    def evaluate_geo_expr(  # noqa: C901
        self, node: Dict[str, Any], record: Dict[str, Any], field_mappings: Dict[str, str]
@@ -106,19 +118,26 @@
        elif "as" in record:
            geo_data["as"] = record["as"]
        else:
-           # Default locations
+           # Default locations (ECS style)
            if "." in actual_field:
-               # For nested fields like destination.ip, check destination.geo
+               # For nested fields like destination.ip, check destination.geo and destination.as
                parent_path = actual_field.rsplit(".", 1)[0]
                parent = self._get_field_value(record, parent_path)
-               if isinstance(parent, dict) and "geo" in parent:
-                   # Found geo data under parent
-                   geo_data = parent
+               if isinstance(parent, dict) and ("geo" in parent or "as" in parent):
+                   # Found geo/as data under parent
+                   geo_data = {}
+                   if "geo" in parent:
+                       geo_data["geo"] = parent["geo"]
+                   if "as" in parent:
+                       geo_data["as"] = parent["as"]
            else:
-               # For top-level fields, check enrichment.geo
-               if "enrichment" in record and isinstance(record["enrichment"], dict):
-                   if "geo" in record["enrichment"]:
-                       geo_data = record["enrichment"]
+               # For top-level fields like ip, check top-level geo and as fields (ECS style)
+               if "geo" in record or "as" in record:
+                   geo_data = {}
+                   if "geo" in record:
+                       geo_data["geo"] = record["geo"]
+                   if "as" in record:
+                       geo_data["as"] = record["as"]
 
        # Check if we should use existing geo data or force a new lookup
        force_lookup = geo_params.get("force", False)
@@ -148,6 +167,39 @@
            # Apply geo lookup
            geo_data = apply_mutators(field_value, [geo_mutator], actual_field, record)
 
+       # Always include enrichment in query results (save=True adds to record for output)
+       # Note: This does not modify source files - enrichment only appears in query results
+       save_enrichment = geo_params.get("save", True)
+       if save_enrichment and geo_data and isinstance(geo_data, dict):
+           # Determine where to save the enrichment
+           if custom_field:
+               # Save to custom field location
+               self._set_field_value(record, custom_field, geo_data.get("geo"))
+               if "as" in geo_data:
+                   # Save AS data as sibling to geo field
+                   if "." in custom_field:
+                       as_parent_path = custom_field.rsplit(".", 1)[0]
+                       parent = self._get_field_value(record, as_parent_path)
+                       if isinstance(parent, dict):
+                           parent["as"] = geo_data["as"]
+                   else:
+                       record["as"] = geo_data["as"]
+           elif "." in actual_field:
+               # For nested fields like destination.ip, save to destination.geo and destination.as (ECS style)
+               parent_path = actual_field.rsplit(".", 1)[0]
+               parent = self._get_field_value(record, parent_path)
+               if isinstance(parent, dict):
+                   if "geo" in geo_data:
+                       parent["geo"] = geo_data["geo"]
+                   if "as" in geo_data:
+                       parent["as"] = geo_data["as"]
+           else:
+               # For top-level fields like ip, save to top-level geo and as fields (ECS style)
+               if "geo" in geo_data:
+                   record["geo"] = geo_data["geo"]
+               if "as" in geo_data:
+                   record["as"] = geo_data["as"]
+
        # Now evaluate the conditions against the geo data
        if conditions:
            # Handle None geo_data (e.g., private IPs or lookup failures)
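
The save logic above follows ECS-style placement: for a dotted field such as destination.ip the geo/as enrichment is written as siblings under the same parent, using the nested setter added earlier. Below is a small, self-contained sketch of that behavior; the record contents, enrichment payload, and the standalone set_field_value helper are invented for illustration and are not part of the package.

from typing import Any, Dict


def set_field_value(record: Dict[str, Any], field_path: str, value: Any) -> None:
    # Mirrors _default_set_field_value: walk/create nested dicts along a dotted
    # path and assign the leaf value.
    parts = field_path.split(".")
    current = record
    for part in parts[:-1]:
        if part not in current:
            current[part] = {}
        current = current[part]
    current[parts[-1]] = value


record = {"destination": {"ip": "203.0.113.10"}}
# Hypothetical lookup result; a real payload would come from the geo mutator.
enrichment = {"geo": {"country_iso_code": "US"}, "as": {"number": 64496}}

# ECS-style placement for a nested field: geo and as land next to the queried field.
set_field_value(record, "destination.geo", enrichment["geo"])
set_field_value(record, "destination.as", enrichment["as"])

assert record["destination"]["geo"]["country_iso_code"] == "US"
assert record["destination"]["as"]["number"] == 64496
# A top-level field (e.g. "ip") would instead receive top-level "geo" and "as" keys.
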
@@ -85,10 +85,6 @@ class ValueComparator:
            return False
 
        # Convert numeric strings to numbers for comparison
-       # IMPORTANT: Store original values to check if conversion succeeded
-       field_value_original = field_value
-       expected_value_original = expected_value
-
        field_value = self._convert_numeric(field_value)
        expected_value = self._convert_numeric(expected_value)
 
tql/exceptions.py CHANGED
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Optional
 class TQLError(Exception):
    """Base exception class for all TQL errors."""
 
-    def __init__(
+    def __init__(  # noqa: B042
        self,
        message: str,
        position: Optional[int] = None,
@@ -72,7 +72,7 @@ class TQLParseError(TQLError):
 class TQLTypeError(TQLError):
    """Raised when an operator is incompatible with a field's data type."""
 
-    def __init__(
+    def __init__(  # noqa: B042
        self, field: str, field_type: str, operator: str, valid_operators: Optional[List[str]] = None, **kwargs
    ):
        """Initialize type error with field and operator context."""
@@ -98,7 +98,7 @@ class TQLTypeError(TQLError):
 class TQLFieldError(TQLError):
    """Raised when referencing invalid or non-existent fields."""
 
-    def __init__(self, field: str, available_fields: Optional[List[str]] = None, **kwargs):
+    def __init__(self, field: str, available_fields: Optional[List[str]] = None, **kwargs):  # noqa: B042
        """Initialize field error with available fields context."""
        message = f"Unknown field '{field}'."
 
@@ -147,7 +147,9 @@ class TQLConfigError(TQLError):
 class TQLMutatorError(TQLError):
    """Raised when there's an error applying a mutator."""
 
-    def __init__(self, mutator_name: str, field_name: str, value_type: str, message: Optional[str] = None, **kwargs):
+    def __init__(  # noqa: B042
+        self, mutator_name: str, field_name: str, value_type: str, message: Optional[str] = None, **kwargs
+    ):
        """Initialize mutator error with context."""
        if not message:
            message = (