tellaro-query-language 0.2.3__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- tellaro_query_language-0.2.6.dist-info/LICENSE +72 -0
- tellaro_query_language-0.2.6.dist-info/METADATA +806 -0
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/RECORD +23 -20
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/entry_points.txt +1 -0
- tql/cache/base.py +36 -2
- tql/cache/memory.py +53 -6
- tql/cache/redis.py +52 -11
- tql/cli.py +484 -0
- tql/core.py +244 -5
- tql/evaluator.py +1 -1
- tql/evaluator_components/special_expressions.py +62 -10
- tql/evaluator_components/value_comparison.py +0 -4
- tql/exceptions.py +6 -4
- tql/field_type_inference.py +285 -0
- tql/mutators/geo.py +57 -20
- tql/opensearch_components/query_converter.py +1 -1
- tql/opensearch_stats.py +7 -6
- tql/parser.py +7 -3
- tql/post_processor.py +8 -4
- tql/scripts.py +3 -3
- tql/stats_evaluator.py +357 -5
- tql/streaming_file_processor.py +335 -0
- tellaro_query_language-0.2.3.dist-info/LICENSE +0 -21
- tellaro_query_language-0.2.3.dist-info/METADATA +0 -433
- {tellaro_query_language-0.2.3.dist-info → tellaro_query_language-0.2.6.dist-info}/WHEEL +0 -0
tql/field_type_inference.py
ADDED
@@ -0,0 +1,285 @@
+"""Field type inference for CSV and other structured data files.
+
+This module provides utilities to automatically detect field types from sample data,
+supporting various data formats including numbers, booleans, dates, and strings.
+"""
+
+import re
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+class FieldTypeInferencer:
+    """Infers field types from sample data records."""
+
+    # Common date/timestamp patterns
+    DATE_PATTERNS = [
+        (r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", "%Y-%m-%dT%H:%M:%S"),  # ISO 8601
+        (r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", "%Y-%m-%d %H:%M:%S"),  # SQL datetime
+        (r"^\d{4}-\d{2}-\d{2}", "%Y-%m-%d"),  # Date only
+        (r"^\d{2}/\d{2}/\d{4}", "%m/%d/%Y"),  # US date
+        (r"^\d{2}-\d{2}-\d{4}", "%d-%m-%Y"),  # EU date
+    ]
+
+    # Boolean value mappings
+    BOOLEAN_VALUES = {
+        "true": True,
+        "false": False,
+        "yes": True,
+        "no": False,
+        "t": True,
+        "f": False,
+        "y": True,
+        "n": False,
+        "1": True,
+        "0": False,
+        "on": True,
+        "off": False,
+    }
+
+    def __init__(self, sample_size: int = 100):
+        """Initialize the inferencer.
+
+        Args:
+            sample_size: Number of records to sample for type inference
+        """
+        self.sample_size = sample_size
+
+    def infer_from_records(
+        self, records: List[Dict[str, Any]], field_overrides: Optional[Dict[str, str]] = None
+    ) -> Dict[str, str]:
+        """Infer field types from a list of records.
+
+        Args:
+            records: List of dictionaries representing records
+            field_overrides: Optional manual field type overrides
+
+        Returns:
+            Dictionary mapping field names to inferred types
+        """
+        if not records:
+            return {}
+
+        # Sample records if we have more than sample_size
+        sample = records[: self.sample_size] if len(records) > self.sample_size else records
+
+        # Collect all field names
+        all_fields: set[str] = set()
+        for record in sample:
+            all_fields.update(record.keys())
+
+        # Infer type for each field
+        field_types = {}
+        for field in all_fields:
+            # Check for manual override first
+            if field_overrides and field in field_overrides:
+                field_types[field] = field_overrides[field]
+            else:
+                field_types[field] = self._infer_field_type(field, sample)
+
+        return field_types
+
+    def _infer_field_type(self, field: str, records: List[Dict[str, Any]]) -> str:  # noqa: C901
+        """Infer the type of a single field from sample records.
+
+        Args:
+            field: Field name
+            records: Sample records
+
+        Returns:
+            Inferred type string ('integer', 'float', 'boolean', 'date', 'string')
+        """
+        # Collect non-null values
+        values = []
+        for record in records:
+            if field in record and record[field] is not None and record[field] != "":
+                values.append(record[field])
+
+        if not values:
+            return "string"  # Default for empty fields
+
+        # If values are already typed (from JSON), use those types
+        if all(isinstance(v, bool) for v in values):
+            return "boolean"
+        if all(isinstance(v, int) for v in values):
+            return "integer"
+        if all(isinstance(v, (int, float)) for v in values):
+            return "float"
+
+        # For string values, try to infer more specific types
+        string_values = [str(v) for v in values]
+
+        # Try boolean detection
+        if self._is_boolean_field(string_values):
+            return "boolean"
+
+        # Try integer detection
+        if self._is_integer_field(string_values):
+            return "integer"
+
+        # Try float detection
+        if self._is_float_field(string_values):
+            return "float"
+
+        # Try date detection
+        if self._is_date_field(string_values):
+            return "date"
+
+        # Default to string
+        return "string"
+
+    def _is_boolean_field(self, values: List[str]) -> bool:
+        """Check if values represent boolean data."""
+        # At least 80% of values should be recognizable boolean values
+        boolean_count = sum(1 for v in values if v.lower() in self.BOOLEAN_VALUES)
+        return boolean_count / len(values) >= 0.8
+
+    def _is_integer_field(self, values: List[str]) -> bool:
+        """Check if values represent integer data."""
+        try:
+            for v in values:
+                int(v)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _is_float_field(self, values: List[str]) -> bool:
+        """Check if values represent floating point data."""
+        try:
+            for v in values:
+                float(v)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _is_date_field(self, values: List[str]) -> bool:
+        """Check if values represent date/timestamp data."""
+        # Try each date pattern
+        match_counts = []
+        for pattern, _ in self.DATE_PATTERNS:
+            matches = sum(1 for v in values if re.match(pattern, v))
+            match_counts.append(matches)
+
+        # If any pattern matches at least 80% of values, consider it a date field
+        best_match_rate = max(match_counts) / len(values) if match_counts else 0
+        return best_match_rate >= 0.8
+
+    def detect_csv_headers(self, first_row: List[str], second_row: List[str]) -> bool:
+        """Detect if the first row of a CSV is a header row.
+
+        Uses heuristics to determine if first row looks like column names.
+
+        Args:
+            first_row: First row values
+            second_row: Second row values
+
+        Returns:
+            True if first row appears to be headers
+        """
+        if not first_row or not second_row:
+            return False
+
+        # Heuristic 1: First row all strings, second row has numbers
+        first_row_all_alpha = all(self._is_mostly_alpha(val) for val in first_row)
+        second_row_has_numbers = any(self._is_numeric(val) for val in second_row)
+
+        if first_row_all_alpha and second_row_has_numbers:
+            return True
+
+        # Heuristic 2: First row has no duplicates (headers should be unique)
+        if len(first_row) != len(set(first_row)):
+            return False
+
+        # Heuristic 3: First row values look like identifiers (snake_case, camelCase, etc.)
+        identifier_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
+        identifier_count = sum(1 for val in first_row if identifier_pattern.match(val))
+        if identifier_count / len(first_row) >= 0.7:
+            return True
+
+        # Heuristic 4: Second row has more varied types than first row
+        first_row_types = self._get_value_types(first_row)
+        second_row_types = self._get_value_types(second_row)
+
+        if len(second_row_types) > len(first_row_types):
+            return True
+
+        # Default to no header if inconclusive
+        return False
+
+    def _is_mostly_alpha(self, value: str) -> bool:
+        """Check if value is mostly alphabetic (for header detection)."""
+        if not value:
+            return False
+        alpha_count = sum(1 for c in value if c.isalpha() or c in "_- ")
+        return alpha_count / len(value) >= 0.5
+
+    def _is_numeric(self, value: str) -> bool:
+        """Check if value is numeric."""
+        try:
+            float(value)
+            return True
+        except (ValueError, TypeError):
+            return False
+
+    def _get_value_types(self, values: List[str]) -> set:
+        """Get set of value types in a list."""
+        types = set()
+        for val in values:
+            if self._is_numeric(val):
+                types.add("numeric")
+            elif val.lower() in self.BOOLEAN_VALUES:
+                types.add("boolean")
+            elif any(re.match(pattern, val) for pattern, _ in self.DATE_PATTERNS):
+                types.add("date")
+            else:
+                types.add("string")
+        return types
+
+    def convert_value(self, value: Any, field_type: str) -> Any:
+        """Convert a value to its inferred type.
+
+        Args:
+            value: Raw value (usually string from CSV)
+            field_type: Target type
+
+        Returns:
+            Converted value
+        """
+        if value is None or value == "":
+            return None
+
+        try:
+            if field_type == "integer":
+                return int(value)
+            elif field_type == "float":
+                return float(value)
+            elif field_type == "boolean":
+                str_val = str(value).lower()
+                return self.BOOLEAN_VALUES.get(str_val, bool(value))
+            elif field_type == "date":
+                # Try to parse as datetime
+                return self._parse_date(str(value))
+            else:
+                return str(value)
+        except (ValueError, TypeError):
+            # If conversion fails, return as string
+            return str(value)
+
+    def _parse_date(self, value: str) -> Optional[str]:
+        """Parse date string and return in ISO format.
+
+        Args:
+            value: Date string
+
+        Returns:
+            ISO formatted date string or original if parsing fails
+        """
+        for pattern, date_format in self.DATE_PATTERNS:
+            if re.match(pattern, value):
+                try:
+                    dt = datetime.strptime(value, date_format)
+                    return dt.isoformat()
+                except ValueError:
+                    continue
+        # Return original if no pattern matches
+        return value
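Note (not part of the diff): a minimal usage sketch of the new module, based on the code above. The sample records and expected results are hypothetical, not taken from the package's tests.

    from tql.field_type_inference import FieldTypeInferencer

    # Hypothetical CSV-style rows where every value arrives as a string.
    records = [
        {"port": "443", "secure": "yes", "seen": "2024-01-15", "host": "web-01"},
        {"port": "8080", "secure": "no", "seen": "2024-01-16", "host": "web-02"},
    ]

    inferencer = FieldTypeInferencer(sample_size=100)
    types = inferencer.infer_from_records(records)
    # -> {"port": "integer", "secure": "boolean", "seen": "date", "host": "string"}

    # Values can then be coerced to their inferred types.
    inferencer.convert_value("2024-01-15", types["seen"])
    # -> "2024-01-15T00:00:00" (ISO formatted by _parse_date)

    # Header detection compares a candidate header row against the first data row.
    inferencer.detect_csv_headers(["host", "port", "secure"], ["web-01", "443", "yes"])
    # -> True: the first row looks like identifiers and the second row contains numbers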
tql/mutators/geo.py
CHANGED
@@ -31,11 +31,22 @@ class GeoIPResolver:
         self.mmdb_readers = self._load_mmdb_files()

     def _load_mmdb_files(self) -> Dict[str, Any]:  # noqa: C901
-        """Load MMDB files with smart detection.
-
-
+        """Load MMDB files with smart detection.
+
+        Supported environment variables (in priority order):
+        - TQL_GEOIP_DB_PATH: Full path to combined database (City, Country, ASN all-in-one)
+        - TQL_GEOIP_DB_CITY_PATH: Path to City database
+        - TQL_GEOIP_DB_COUNTRY_PATH: Path to Country database
+        - TQL_GEOIP_DB_ASN_PATH: Path to ASN database
+        - TQL_GEOIP_MMDB_PATH: Base directory for auto-detection (default: /usr/share/geoip)
+        """
+        # Check for explicit full DB path first (from config or environment variable)
+        db_path = self.config.get("db_path") or os.environ.get("TQL_GEOIP_DB_PATH")
+        if db_path:
+            # Expand user home directory (~)
+            db_path = os.path.expanduser(db_path)
+        if db_path and os.path.exists(db_path):
             # Detect DB type from filename
-            db_path = self.config["db_path"]
             db_lower = db_path.lower()
             if "dbip" in db_lower or "db-ip" in db_lower:
                 self.db_type = "dbip"
@@ -53,6 +64,7 @@ class GeoIPResolver:

         # Check base path for auto-detection
         base_path = self.config.get("base_path", os.environ.get("TQL_GEOIP_MMDB_PATH", "/usr/share/geoip"))
+        base_path = os.path.expanduser(base_path)

         # Priority order for database detection
         db_patterns: List[Dict[str, Any]] = [
@@ -100,11 +112,19 @@ class GeoIPResolver:
                 self.mmdb_type = pattern["mmdb_type"]
                 return {"full": maxminddb.open_database(path)}
             else:
-                # Multiple files needed
+                # Multiple files needed - check config, environment variables, then base_path
                 readers = {}
                 all_found = True
                 for db_type, filename in pattern["files"].items():
-
+                    # Priority: config > environment variable > base_path/filename
+                    env_var_name = f"TQL_GEOIP_DB_{db_type.upper()}_PATH"
+                    path = (
+                        self.config.get(f"{db_type}_db")
+                        or os.environ.get(env_var_name)
+                        or os.path.join(base_path, filename)
+                    )
+                    # Expand user home directory (~)
+                    path = os.path.expanduser(path)
                     if os.path.exists(path):
                         readers[db_type] = maxminddb.open_database(path)
                     else:
@@ -176,6 +196,7 @@ class GeoIPLookupMutator(BaseMutator):
     # Class-level cache and resolver
    _cache_manager: Optional[CacheManager] = None
    _geo_resolver: Optional[GeoIPResolver] = None
+    _geo_resolvers: Dict[str, GeoIPResolver] = {}

     def __init__(self, params: Optional[Dict[str, Any]] = None) -> None:
         super().__init__(params)
@@ -223,10 +244,27 @@ class GeoIPLookupMutator(BaseMutator):

     @classmethod
     def get_geo_resolver(cls, config: Optional[Dict[str, str]] = None) -> GeoIPResolver:
-        """Get or create the GeoIP resolver.
-
-
-
+        """Get or create the GeoIP resolver.
+
+        Note: Resolver is cached at class level per unique configuration.
+        Config with explicit paths will be cached separately from environment variable configs.
+        Environment variable changes require process restart to take effect.
+        """
+        # Create cache key based on config
+        if config is None:
+            cache_key = "env_vars"  # Uses environment variables
+        else:
+            # Use config values as cache key
+            cache_key = str(sorted(config.items()))
+
+        # Check if we have a cached resolver for this config
+        if not hasattr(cls, "_geo_resolvers"):
+            cls._geo_resolvers = {}
+
+        if cache_key not in cls._geo_resolvers:
+            cls._geo_resolvers[cache_key] = GeoIPResolver(config)
+
+        return cls._geo_resolvers[cache_key]

     def _get_field_value(self, record: Dict[str, Any], field_path: str) -> Any:
         """Get a field value from a record, supporting nested fields."""
@@ -274,7 +312,7 @@ class GeoIPLookupMutator(BaseMutator):
         existing_geo_data = None
         existing_as_data = None

-        # Check if geo data already exists in the record
+        # Check if geo data already exists in the record (ECS style)
         if "." in field_name:
             # For nested fields like destination.ip, check destination.geo and destination.as
             parent_path = field_name.rsplit(".", 1)[0]
@@ -283,10 +321,9 @@ class GeoIPLookupMutator(BaseMutator):
                 existing_geo_data = parent.get("geo")
                 existing_as_data = parent.get("as")
         else:
-            # For top-level fields, check
-
-
-            existing_as_data = record["enrichment"].get("as")
+            # For top-level fields like ip, check top-level geo and as fields (ECS style)
+            existing_geo_data = record.get("geo")
+            existing_as_data = record.get("as")

         # If not forcing and geo data exists with at least country_iso_code, return existing
         if (
@@ -318,12 +355,12 @@ class GeoIPLookupMutator(BaseMutator):

         # Get GeoIP resolver and perform lookup
         try:
-
+            # Build config only if we have explicit parameters
+            # Otherwise pass None to let GeoIPResolver read environment variables
+            geo_config = None
             if self.params.get("db_path"):
-                geo_config
-
-                # Use environment variable for direct database path
-                geo_config["db_path"] = os.getenv("TQL_GEOIP_FULL_PATH")
+                geo_config = {"db_path": self.params["db_path"]}
+
             geo_resolver = self.get_geo_resolver(geo_config)

             # Perform lookup
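Note (not part of the diff): a short configuration sketch of the new environment-variable handling. The file paths below are placeholders, and an actual lookup still requires the MMDB files to exist on disk.

    import os

    # Option 1: a single combined database (City, Country, ASN in one MMDB).
    os.environ["TQL_GEOIP_DB_PATH"] = "~/geoip/dbip-full.mmdb"  # placeholder path; "~" is expanded

    # Option 2: separate databases, one variable per type; anything not set falls back to
    # TQL_GEOIP_MMDB_PATH (default /usr/share/geoip) joined with the expected filename.
    os.environ["TQL_GEOIP_DB_CITY_PATH"] = "~/geoip/GeoLite2-City.mmdb"  # placeholder path
    os.environ["TQL_GEOIP_DB_ASN_PATH"] = "~/geoip/GeoLite2-ASN.mmdb"  # placeholder path

    # At lookup time, GeoIPLookupMutator.get_geo_resolver(None) builds one GeoIPResolver from
    # these variables and caches it under the "env_vars" key; passing an explicit
    # {"db_path": ...} config is cached separately, keyed on sorted(config.items()).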
tql/opensearch_components/query_converter.py
CHANGED
@@ -22,7 +22,7 @@ class QueryConverter:
         self.intelligent_mappings = field_mappings
         self.simple_mappings = simple_mappings

-    def convert_node(self, node: Any) -> Dict[str, Any]:
+    def convert_node(self, node: Any) -> Dict[str, Any]:  # noqa: C901
         """Convert a single AST node to OpenSearch query fragment."""
         if isinstance(node, dict):
             node_type = node.get("type")
tql/opensearch_stats.py
CHANGED
@@ -3,8 +3,8 @@
 This module translates TQL stats queries to OpenSearch aggregation DSL.
 """

-from typing import Any, Dict, List, Optional, Union
 import json
+from typing import Any, Dict, List, Optional, Union

 from .exceptions import TQLError

@@ -86,7 +86,9 @@ class OpenSearchStatsTranslator:
             # Build nested terms aggregations for grouping
             aggs_dsl = self._build_grouped_aggregations(aggregations, group_by_fields, field_mappings)
             print(
-                f"\n=== OpenSearch Aggregation Query ===\
+                f"\n=== OpenSearch Aggregation Query ===\n"
+                f"Group by: {group_by_fields}\n"
+                f"Aggregation DSL: {json.dumps(aggs_dsl, indent=2)}\n"
             )
         else:
             # Simple aggregations without grouping
@@ -186,7 +188,6 @@ class OpenSearchStatsTranslator:
         # Check for top/bottom modifiers
         order_field = None
         order_direction = "desc"
-        size = 10

         for agg in aggregations:
             if "modifier" in agg:
@@ -194,7 +195,7 @@ class OpenSearchStatsTranslator:
                 alias = agg.get("alias") or f"{agg['function']}_{agg['field']}_0"
                 order_field = alias
                 order_direction = "desc" if agg["modifier"] == "top" else "asc"
-
+                _size = agg.get("limit", 10)  # noqa: F841
                 break

         # Normalize group_by_fields to handle both old (string) and new (dict) formats
@@ -215,7 +216,7 @@ class OpenSearchStatsTranslator:
         current_aggs = inner_aggs

         # Process group_by fields in reverse order to build proper nesting
-        for
+        for _i, field_spec in enumerate(reversed(normalized_fields)):
             field_name = field_spec["field"]
             bucket_size = field_spec["bucket_size"]

@@ -443,7 +444,7 @@ class OpenSearchStatsTranslator:

         return result

-    def _transform_bucket_recursive(
+    def _transform_bucket_recursive(  # noqa: C901
         self,
         bucket: Dict[str, Any],
         aggregations: List[Dict[str, Any]],
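Note (not part of the diff): the grouped-aggregation DSL printed by the debug statement above is a set of nested terms aggregations. A rough, hand-written example of that general shape follows; the key names, fields, and sizes are illustrative placeholders, not the translator's exact output.

    # Illustrative OpenSearch aggregation body for a stats query grouped by two fields,
    # e.g. a count grouped by source.ip and destination.port (shape only).
    aggs_dsl = {
        "source_ip_terms": {
            "terms": {"field": "source.ip", "size": 10},
            "aggs": {
                "destination_port_terms": {
                    "terms": {"field": "destination.port", "size": 10},
                    "aggs": {"count_0": {"value_count": {"field": "destination.port"}}},
                }
            },
        }
    }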
tql/parser.py
CHANGED
@@ -138,7 +138,7 @@ class TQLParser:
                 "Please simplify your query to reduce nesting.",
                 position=0,
                 query="",
-                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"]
+                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"],
             )

         if isinstance(parsed, list):
@@ -290,7 +290,11 @@ class TQLParser:
                 }
             else:
                 # Fallback to treating as unary logical operator
-                return {
+                return {
+                    "type": "unary_op",
+                    "operator": first.lower(),
+                    "operand": self._build_ast(second, depth + 1),
+                }
         elif len(parsed) >= 3:
             # Check if this is a field with multiple mutators
             if isinstance(parsed[0], str) and all(
@@ -1239,7 +1243,7 @@ class TQLParser:
                 "Please simplify your query to reduce nesting.",
                 position=0,
                 query="",
-                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"]
+                suggestions=["Reduce query nesting depth", "Split into multiple simpler queries"],
             )
         if len(parsed_list) < 3:
             # Not enough elements for a chained operation
tql/post_processor.py
CHANGED
@@ -137,7 +137,9 @@ class QueryPostProcessor:
                 # This is a special case: field | any/all/none eq value
                 # Safe access - both keys are guaranteed to exist by the if checks
                 array_operator = requirement.metadata["operator"]  # exists from line 128 check
-                comparison_operator = requirement.metadata[
+                comparison_operator = requirement.metadata[
+                    "comparison_operator"
+                ]  # exists from line 135 check
                 value = requirement.metadata.get("value")

                 # Get the field value with proper nested field handling
@@ -1015,10 +1017,10 @@ class QueryPostProcessor:
        - "user.address.zip" -> "user.address.__zip_mutated__"
        - "status" -> "__status_mutated__"
        """
-        field_parts = field_name.split(
+        field_parts = field_name.split(".")
         if len(field_parts) > 1:
             # For nested fields, only mutate the leaf field name
-            return
+            return ".".join(field_parts[:-1] + [f"__{field_parts[-1]}_mutated__"])
         else:
             # For flat fields, mutate the entire name
             return f"__{field_name}_mutated__"
@@ -1169,7 +1171,9 @@ class PostProcessingStats:
 class PostProcessingError(Exception):
     """Exception raised during post-processing operations."""

-    def __init__(
+    def __init__(  # noqa: B042
+        self, message: str, field_name: Optional[str] = None, mutator_name: Optional[str] = None
+    ):
         """Initialize post-processing error.

         Args:
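Note (not part of the diff): the repaired _get_mutated_field_name now performs the leaf-only renaming its docstring describes. A quick sketch of the same logic in isolation:

    # Only the leaf segment of a dotted path is renamed; flat names are renamed whole.
    field_name = "user.address.zip"
    parts = field_name.split(".")
    if len(parts) > 1:
        mutated = ".".join(parts[:-1] + [f"__{parts[-1]}_mutated__"])
    else:
        mutated = f"__{field_name}_mutated__"
    assert mutated == "user.address.__zip_mutated__"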
tql/scripts.py
CHANGED
@@ -34,7 +34,7 @@ def run_coverage():
     env = os.environ.copy()
     if "INTEGRATION_TEST_ENABLE" not in env:
         env["INTEGRATION_TEST_ENABLE"] = "false"
-
+
     # 1. Run pytest with coverage, using `src` as the source
     subprocess.run(["coverage", "run", "--source=src", "-m", "pytest"], check=True, env=env)  # nosec

@@ -50,7 +50,7 @@ def run_tests():
     env = os.environ.copy()
     if "INTEGRATION_TEST_ENABLE" not in env:
         env["INTEGRATION_TEST_ENABLE"] = "false"
-
+
     subprocess.run(["pytest", "tests"], check=True, env=env)  # nosec


@@ -99,7 +99,7 @@ def run_badge():
     env = os.environ.copy()
     if "INTEGRATION_TEST_ENABLE" not in env:
         env["INTEGRATION_TEST_ENABLE"] = "false"
-
+
     subprocess.run(  # nosec
         [
             "coverage",