PyPI - qtype - Versions diffs - 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl - Mend

qtype 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (272) hide show

qtype/interpreter/converters.py CHANGED Viewed

@@ -2,37 +2,16 @@
 from __future__ import annotations
-from collections.abc import AsyncIterator
-from typing import Any, cast
+from pathlib import Path
+from typing import Any
+import fsspec
 import pandas as pd
 from pydantic import BaseModel
 from qtype.interpreter.types import FlowMessage, Session
-from qtype.semantic.model import Flow
-async def dataframe_to_flow_messages(
-    df: pd.DataFrame, session: Session
-) -> AsyncIterator[FlowMessage]:
-    """
-    Convert a DataFrame to an async generator of FlowMessages.
-    Each row in the DataFrame becomes a FlowMessage with the same session.
-    Args:
-        df: DataFrame where each row represents one set of inputs
-        session: Session object to use for all messages
-    Yields:
-        FlowMessages, one per DataFrame row
-    """
-    # Use to_dict with orient='records' - much faster than iterrows
-    # This returns a list of dicts directly without Series overhead
-    records = cast(list[dict[str, Any]], df.to_dict(orient="records"))
-    for record in records:
-        yield FlowMessage(session=session, variables=record)
+from qtype.interpreter.typing import convert_dict_to_typed_variables
+from qtype.semantic.model import Flow, Variable
 def flow_messages_to_dataframe(
@@ -77,3 +56,140 @@ def flow_messages_to_dataframe(
         results.append(row_data)
     return pd.DataFrame(results)
+def read_dataframe_from_file(
+    file_path: str,
+) -> pd.DataFrame:
+    """
+    Read a file into a pandas DataFrame.
+    The format is chosen from a MIME type. For local paths the MIME type is
+    sniffed with libmagic (python-magic); for remote URLs it is inferred from
+    the file extension. The file itself is opened through fsspec, so both
+    local and remote locations are supported. Returns the raw DataFrame
+    without type conversion.
+    Args:
+        file_path: Path to the file (local or remote, e.g., s3://bucket/file)
+    Returns:
+        DataFrame with data from the file
+    Raises:
+        ValueError: If file format is not supported or mime type detection fails
+        FileNotFoundError: If file does not exist
+    Supported formats:
+        - CSV (.csv)
+        - JSON (.json)
+        - JSONL (.jsonl, JSON Lines)
+        - Parquet (.parquet)
+        - Excel (.xlsx, .xls)
+    Examples:
+        >>> # Read CSV
+        >>> df = read_dataframe_from_file("data.csv")
+        >>>
+        >>> # Read from S3
+        >>> df = read_dataframe_from_file("s3://bucket/data.parquet")
+    """
+    from urllib.parse import urlparse
+    ext_to_mime = {
+        ".csv": "text/csv",
+        ".json": "application/json",
+        ".jsonl": "application/jsonlines",
+        ".parquet": "application/vnd.parquet",
+        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ".xls": "application/vnd.ms-excel",
+    }
+    # Unknown content is treated as parquet further down (see dispatch).
+    fallback_mime = "application/octet-stream"
+    if file_path.startswith(("http://", "https://", "s3://", "gs://")):
+        # Remote file: content is not sniffed, the extension decides.
+        # Take the suffix from the URL *path* so that a query string or
+        # fragment ("data.csv?token=...") does not become part of it.
+        extension = Path(urlparse(file_path).path).suffix.lower()
+        mime_type = ext_to_mime.get(extension, fallback_mime)
+    else:
+        extension = Path(file_path).suffix.lower()
+        try:
+            # Imported lazily and inside the try: a missing python-magic /
+            # libmagic then degrades to extension-based detection instead of
+            # failing the whole call (and remote reads never need it).
+            import magic
+            mime_type = magic.Magic(mime=True).from_file(file_path)
+        except Exception as e:
+            # Fallback to extension-based detection
+            mime_type = ext_to_mime.get(extension, fallback_mime)
+            if mime_type == fallback_mime:
+                # Chain the original error so the root cause stays visible.
+                raise ValueError(
+                    f"Could not determine file type for {file_path}: {e}"
+                ) from e
+        else:
+            # NOTE(review): libmagic can presumably report a non-specific
+            # type (e.g. text/plain for JSON Lines, a container type for
+            # .xlsx) - confirm per libmagic version. When the sniffed type is
+            # not one of the specific types we dispatch on and the extension
+            # is a known one, trust the extension.
+            specific_types = set(ext_to_mime.values())
+            if extension in ext_to_mime and mime_type not in specific_types:
+                mime_type = ext_to_mime[extension]
+    # Open file with fsspec (supports local and remote files)
+    with fsspec.open(file_path, "rb") as file_handle:
+        # Read based on MIME type
+        if mime_type in ("text/csv", "text/plain"):
+            # keep_default_na=False keeps empty cells / "NA" strings as-is
+            # instead of converting them to NaN.
+            df = pd.read_csv(file_handle, keep_default_na=False)  # type: ignore[arg-type]
+        elif mime_type in ["application/json", "application/jsonlines"]:
+            # JSON Lines is told apart from plain JSON by extension
+            if extension == ".jsonl":
+                df = pd.read_json(
+                    file_handle,  # type: ignore[arg-type]
+                    lines=True,
+                )
+            else:
+                df = pd.read_json(file_handle)  # type: ignore[arg-type]
+        elif mime_type in [
+            "application/vnd.parquet",
+            "application/octet-stream",
+        ]:
+            # Parquet is often detected as octet-stream
+            df = pd.read_parquet(file_handle)  # type: ignore[arg-type]
+        elif mime_type in [
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.ms-excel",
+        ]:
+            df = pd.read_excel(file_handle)  # type: ignore[arg-type]
+        else:
+            raise ValueError(
+                f"Unsupported MIME type for file {file_path}: {mime_type}"
+            )
+    return df
+def dataframe_to_flow_messages(
+    df: pd.DataFrame,
+    variables: list[Variable],
+    session: Session | None = None,
+) -> list[FlowMessage]:
+    """
+    Convert a DataFrame to FlowMessages with type conversion.
+    Each row in the DataFrame becomes a FlowMessage whose variables are
+    produced by convert_dict_to_typed_variables from the row and the given
+    Variable definitions. All messages share the same session object.
+    Args:
+        df: DataFrame with raw data
+        variables: List of Variable definitions for type conversion
+        session: Session to use for all FlowMessages. When omitted (None), a
+            new Session(session_id="default") is created for this call.
+    Returns:
+        List of FlowMessages, one per row, with typed variables
+    Examples:
+        >>> from qtype.semantic.model import Variable
+        >>> from qtype.base.types import PrimitiveTypeEnum
+        >>> import pandas as pd
+        >>>
+        >>> df = pd.DataFrame({"age": ["30"], "score": ["95.5"]})
+        >>> vars = [
+        ...     Variable(id="age", type=PrimitiveTypeEnum.int),
+        ...     Variable(id="score", type=PrimitiveTypeEnum.float),
+        ... ]
+        >>> messages = dataframe_to_flow_messages(df, vars)
+    """
+    # Build the default per call. A Session(...) expression in the signature
+    # is evaluated once, at definition time, and that single instance would
+    # then be shared by every call that omits the argument.
+    if session is None:
+        session = Session(session_id="default")
+    return [
+        FlowMessage(
+            session=session,
+            variables=convert_dict_to_typed_variables(row_dict, variables),
+        )
+        for row_dict in df.to_dict(orient="records")
+    ]

qtype/interpreter/executors/agent_executor.py CHANGED Viewed

@@ -112,9 +112,8 @@ class AgentExecutor(StepExecutor, ToolExecutionMixin, FunctionToolHelper):
         # Convert input variables to chat messages
         inputs = []
         for input_var in self.step.inputs:
-            value = message.variables.get(input_var.id)
-            if value and isinstance(value, ChatMessage):
-                inputs.append(to_chat_message(value))
+            value = message.get_variable(input_var.id)
+            inputs.append(to_chat_message(value))
         # Get session ID for memory isolation
         session_id = message.session.session_id

qtype/interpreter/executors/aggregate_executor.py CHANGED Viewed

@@ -11,10 +11,9 @@ class AggregateExecutor(BatchedStepExecutor):
     """
     Executor for the Aggregate step.
-    This is a terminal, many-to-one operation that reduces an entire stream
-    to a single summary message containing counts of successful and failed
-    messages. It processes all messages without modification during the
-    processing phase, then emits a single aggregate summary during finalization.
+    A step that, after all messages have been processed,
+    returns a single message containing the counts of successful and failed
+    messages. Other messages are passed through unchanged.
     """
     def __init__(

qtype/interpreter/executors/bedrock_reranker_executor.py CHANGED Viewed

@@ -2,7 +2,6 @@
 from __future__ import annotations
-import asyncio
 import logging
 from typing import AsyncIterator
@@ -56,10 +55,10 @@ class BedrockRerankerExecutor(StepExecutor):
                 )
                 return
-            # Get session for region info
+            # Get region from auth or default session
             if self.step.auth is not None:
-                with aws(self.step.auth, self.context.secret_manager) as s:
-                    region_name = s.region_name
+                with aws(self.step.auth, self.context.secret_manager) as creds:
+                    region_name = creds.region_name
             else:
                 import boto3
@@ -120,31 +119,21 @@ class BedrockRerankerExecutor(StepExecutor):
                 },
             }
-            def _call_bedrock_rerank():
-                """Create client and call rerank in executor thread."""
-                if self.step.auth is not None:
-                    with aws(self.step.auth, self.context.secret_manager) as s:
-                        client = s.client("bedrock-agent-runtime")
-                        return client.rerank(
-                            queries=queries,
-                            sources=documents,
-                            rerankingConfiguration=reranking_configuration,
-                        )
-                else:
-                    import boto3
-                    session = boto3.Session()
-                    client = session.client("bedrock-agent-runtime")
-                    return client.rerank(
-                        queries=queries,
-                        sources=documents,
-                        rerankingConfiguration=reranking_configuration,
-                    )
+            # Create async bedrock client and call rerank
+            import aioboto3
-            loop = asyncio.get_running_loop()
-            response = await loop.run_in_executor(
-                self.context.thread_pool, _call_bedrock_rerank
-            )
+            creds_kwargs = {}
+            if self.step.auth is not None:
+                with aws(self.step.auth, self.context.secret_manager) as creds:
+                    creds_kwargs = creds.as_kwargs()
+            session = aioboto3.Session(**creds_kwargs)
+            async with session.client("bedrock-agent-runtime") as client:
+                response = await client.rerank(
+                    queries=queries,
+                    sources=documents,
+                    rerankingConfiguration=reranking_configuration,
+                )
             results = []
             for d in response["results"]:

qtype/interpreter/executors/construct_executor.py CHANGED Viewed

@@ -4,7 +4,7 @@ from qtype.dsl.model import ListType
 from qtype.interpreter.base.base_step_executor import StepExecutor
 from qtype.interpreter.base.executor_context import ExecutorContext
 from qtype.interpreter.types import FlowMessage
-from qtype.interpreter.typing import instantiate_variable
+from qtype.interpreter.typing import convert_dict_to_typed_variables
 from qtype.semantic.model import Construct
@@ -43,26 +43,26 @@ class ConstructExecutor(StepExecutor):
                 isinstance(output_var.type, ListType)
                 or len(self.step.inputs) == 1
             ):
-                inputs = message.variables[self.step.inputs[0].id]
+                # Single input: pass value directly
+                data = {
+                    output_var.id: message.variables[self.step.inputs[0].id]
+                }
             elif hasattr(output_var.type, "model_validate"):
                 # This is a custom type (Pydantic model)
-                # So input should be a dict
-                input_values = {
-                    input_var.id: message.variables[input_var.id]
-                    for input_var in self.step.inputs
-                }
-                # use the mapping to convert variable names to
-                inputs = {
-                    self.step.field_mapping.get(var_name, var_name): value  # type: ignore[attr-defined]
-                    for var_name, value in input_values.items()
+                # field_bindings maps type field names to Variables
+                data = {
+                    output_var.id: {
+                        field_name: message.variables[var.id]
+                        for field_name, var in self.step.field_bindings.items()
+                    }
                 }
             else:
                 raise ValueError(
                     "Construct step must have either a single input or output of a custom type."
                 )
-            constructed_value = instantiate_variable(output_var, inputs)
-            yield message.copy_with_variables(
-                {output_var.id: constructed_value}
-            )
+            # Use convert_dict_to_typed_variables to validate and convert
+            result = convert_dict_to_typed_variables(data, self.step.outputs)
+            yield message.copy_with_variables(result)
         except Exception as e:
             yield message.copy_with_error(self.step.id, e)

qtype/interpreter/executors/doc_to_text_executor.py CHANGED Viewed

@@ -49,9 +49,7 @@ class DocToTextConverterExecutor(StepExecutor):
         try:
             # Get the input document
-            if input_id not in message.variables:
-                raise ValueError(f"Input variable '{input_id}' is missing")
-            doc = message.variables.get(input_id)
+            doc = message.get_variable(input_id)
             if not isinstance(doc, RAGDocument):
                 raise ValueError(
                     f"Input variable '{input_id}' must be a RAGDocument"

qtype/interpreter/executors/document_embedder_executor.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import asyncio
 import logging
 from typing import AsyncIterator
@@ -60,17 +59,7 @@ class DocumentEmbedderExecutor(StepExecutor):
         Returns:
             The embedding vector as a list of floats.
         """
-        # TODO: switch back to async once aws auth supports it.
-        # https://github.com/bazaarvoice/qtype/issues/108
-        def _call():
-            return self.embedding_model.get_text_embedding(text=text)
-        loop = asyncio.get_running_loop()
-        response = await loop.run_in_executor(self.context.thread_pool, _call)
-        return response
-        # return await self.embedding_model.aget_text_embedding(text=text)
+        return await self.embedding_model.aget_text_embedding(text=text)
     async def process_message(
         self,

qtype/interpreter/executors/field_extractor_executor.py CHANGED Viewed

@@ -111,15 +111,11 @@ class FieldExtractorExecutor(StepExecutor):
             Multiple messages may be yielded if JSONPath matches multiple values.
         """
         input_id = self.step.inputs[0].id
-        output_id = self.step.outputs[0].id
+        output_var = self.step.outputs[0]
         try:
             # Get the input value
-            input_value = message.variables.get(input_id)
-            if input_value is None:
-                raise ValueError(
-                    f"Input variable '{input_id}' is not set or is None"
-                )
+            input_value = message.get_variable(input_id)
             await self.stream_emitter.status(
                 f"Extracting fields using JSONPath: {self.step.json_path}"
@@ -132,17 +128,20 @@ class FieldExtractorExecutor(StepExecutor):
             matches = self.jsonpath_expr.find(input_dict)
             if not matches:
-                if self.step.fail_on_missing:
+                if output_var.optional:
+                    # Yield message with None output
+                    await self.stream_emitter.status(
+                        "JSONPath matched 0 value(s)"
+                    )
+                    yield message.copy_with_variables({output_var.id: None})
+                    return
+                else:
                     raise ValueError(
                         (
                             f"JSONPath expression '{self.step.json_path}' "
                             f"did not match any data in input"
                         )
                     )
-                else:
-                    # Yield message with None output
-                    yield message.copy_with_variables({output_id: None})
-                    return
             await self.stream_emitter.status(
                 f"JSONPath matched {len(matches)} value(s)"
@@ -156,7 +155,9 @@ class FieldExtractorExecutor(StepExecutor):
                 output_value = self._construct_output(extracted_data)
                 # Yield message with the constructed output
-                yield message.copy_with_variables({output_id: output_value})
+                yield message.copy_with_variables(
+                    {output_var.id: output_value}
+                )
         except Exception as e:
             # Emit error event to stream so frontend can display it

qtype/interpreter/executors/file_source_executor.py CHANGED Viewed

@@ -1,13 +1,13 @@
 from __future__ import annotations
-from pathlib import Path
 from typing import AsyncIterator
-import fsspec
-import pandas as pd
 from qtype.interpreter.base.base_step_executor import StepExecutor
 from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.converters import (
+    dataframe_to_flow_messages,
+    read_dataframe_from_file,
+)
 from qtype.interpreter.types import FlowMessage
 from qtype.semantic.model import ConstantPath, FileSource
@@ -37,8 +37,6 @@ class FileSourceExecutor(StepExecutor):
         Yields:
             FlowMessages with the results of processing.
         """
-        output_columns = {output.id for output in self.step.outputs}
         # get the path
         if isinstance(self.step.path, ConstantPath):  # type: ignore[attr-defined]
             file_path = self.step.path  # type: ignore[attr-defined]
@@ -55,30 +53,16 @@ class FileSourceExecutor(StepExecutor):
                 f"Reading file from path: {file_path}"
             )
-        # Determine file format from extension
+        # Get file path as string
         file_path_str = (
             file_path.uri if isinstance(file_path, ConstantPath) else file_path
         )
-        extension = Path(file_path_str).suffix.lower()
-        # Use fsspec to open the file and read with pandas
-        with fsspec.open(file_path_str, "rb") as file_handle:
-            if extension == ".csv":
-                df = pd.read_csv(file_handle)  # type: ignore[arg-type]
-            elif extension == ".parquet":
-                df = pd.read_parquet(file_handle)  # type: ignore[arg-type]
-            elif extension == ".json":
-                df = pd.read_json(file_handle)  # type: ignore[arg-type]
-            elif extension == ".jsonl":
-                df = pd.read_json(
-                    file_handle,
-                    lines=True,  # type: ignore[arg-type]
-                )
-            else:
-                # Default to parquet if no extension or unknown
-                df = pd.read_parquet(file_handle)  # type: ignore[arg-type]
+        # Read file into DataFrame using helper function
+        df = read_dataframe_from_file(file_path_str)
-        # confirm the outputs exist in the dataframe
+        # Validate that expected output columns are present
+        output_columns = {output.id for output in self.step.outputs}
         columns = set(df.columns)
         missing_columns = output_columns - columns
         if missing_columns:
@@ -90,12 +74,15 @@ class FileSourceExecutor(StepExecutor):
                 )
             )
-        for row in df.to_dict(orient="records"):
-            # Filter to only the expected output columns if they exist
-            row = {
-                str(k): v for k, v in row.items() if str(k) in output_columns
-            }
-            yield message.copy_with_variables(new_variables=row)
+        # Convert DataFrame to FlowMessages with type conversion
+        flow_messages = dataframe_to_flow_messages(
+            df, self.step.outputs, session=message.session
+        )
+        # Yield each message
+        for flow_message in flow_messages:
+            yield flow_message
         await self.stream_emitter.status(
             f"Emitted {len(df)} rows from: {file_path_str}"
         )

qtype/interpreter/executors/invoke_embedding_executor.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import asyncio
 from typing import AsyncIterator
 from openinference.semconv.trace import OpenInferenceSpanKindValues
@@ -52,46 +51,34 @@ class InvokeEmbeddingExecutor(StepExecutor):
         try:
             # Get the input value
-            input_value = message.variables.get(input_id)
+            input_value = message.get_variable(input_id)
-            if input_value is None:
-                raise ValueError(f"Input variable '{input_id}' is missing")
-            def _call(input_value=input_value):
-                # Generate embedding based on input type
-                if input_type == PrimitiveTypeEnum.text:
-                    if not isinstance(input_value, str):
-                        input_value = str(input_value)
-                    vector = self.embedding_model.get_text_embedding(
-                        text=input_value
-                    )
-                    content = input_value
-                elif input_type == PrimitiveTypeEnum.image:
-                    # For image embeddings
-                    vector = self.embedding_model.get_image_embedding(
-                        image_path=input_value
-                    )
-                    content = input_value
-                else:
-                    raise ValueError(
-                        (
-                            f"Unsupported input type for embedding: "
-                            f"{input_type}. Must be 'text' or 'image'."
-                        )
+            # Generate embedding based on input type
+            if input_type == PrimitiveTypeEnum.text:
+                if not isinstance(input_value, str):
+                    input_value = str(input_value)
+                vector = await self.embedding_model.aget_text_embedding(
+                    text=input_value
+                )
+                content = input_value
+            elif input_type == PrimitiveTypeEnum.image:
+                # For image embeddings
+                vector = await self.embedding_model.aget_image_embedding(
+                    image_path=input_value
+                )
+                content = input_value
+            else:
+                raise ValueError(
+                    (
+                        f"Unsupported input type for embedding: "
+                        f"{input_type}. Must be 'text' or 'image'."
                     )
-                # Create the Embedding object
-                embedding = Embedding(
-                    vector=vector,
-                    content=content,
                 )
-                return embedding
-            # TODO: switch back to async once aws auth supports it.
-            # https://github.com/bazaarvoice/qtype/issues/108
-            loop = asyncio.get_running_loop()
-            embedding = await loop.run_in_executor(
-                self.context.thread_pool, _call
+            # Create the Embedding object
+            embedding = Embedding(
+                vector=vector,
+                content=content,
             )
             # Yield the result

qtype/interpreter/executors/invoke_flow_executor.py CHANGED Viewed

@@ -34,7 +34,7 @@ class InvokeFlowExecutor(StepExecutor):
         initial = message.copy_with_variables(
             {
                 id: message.variables.get(var.id)
-                for var, id in self.step.input_bindings.items()
+                for id, var in self.step.input_bindings.items()
             }
         )
         # Pass through context (already available as self.context)
@@ -46,6 +46,6 @@ class InvokeFlowExecutor(StepExecutor):
             yield msg.copy_with_variables(
                 {
                     var.id: msg.variables.get(id)
-                    for var, id in self.step.output_bindings.items()
+                    for id, var in self.step.output_bindings.items()
                 }
             )

qtype/interpreter/executors/invoke_tool_executor.py CHANGED Viewed

@@ -247,28 +247,26 @@ class InvokeToolExecutor(StepExecutor, ToolExecutionMixin):
         """
         tool_inputs = {}
-        for tool_param_name, step_var_id in self.step.input_bindings.items():
+        for tool_param_name, step_variable in self.step.input_bindings.items():
             # Get tool parameter definition
-            tool_param = self.step.tool.inputs.get(tool_param_name)
+            tool_param = next(
+                (p for p in self.step.tool.inputs if p.id == tool_param_name),
+                None,
+            )
             if not tool_param:
                 raise ValueError(
                     f"Tool parameter '{tool_param_name}' not defined in tool"
                 )
             # Get value from message variables
-            value = message.variables.get(step_var_id)
-            # Handle missing values
-            if value is None:
-                if not tool_param.optional:
-                    raise ValueError(
-                        (
-                            f"Required input '{step_var_id}' for tool "
-                            f"parameter '{tool_param_name}' is missing"
-                        )
-                    )
-                # Skip optional parameters that are missing
-                continue
+            # Use default=None for optional params, let get_variable raise for required
+            if tool_param.optional:
+                value = message.get_variable(step_variable.id, default=None)
+                if value is None:
+                    # Skip optional parameters that are unset
+                    continue
+            else:
+                value = message.get_variable(step_variable.id)
             tool_inputs[tool_param_name] = value
@@ -288,9 +286,12 @@ class InvokeToolExecutor(StepExecutor, ToolExecutionMixin):
         """
         output_vars = {}
-        for tool_param_name, step_var_id in self.step.output_bindings.items():
+        for tool_param_name, step_var in self.step.output_bindings.items():
             # Get tool parameter definition
-            tool_param = self.step.tool.outputs.get(tool_param_name)
+            tool_param = next(
+                (p for p in self.step.tool.outputs if p.id == tool_param_name),
+                None,
+            )
             if not tool_param:
                 raise ValueError(
                     f"Tool parameter '{tool_param_name}' not defined in tool"
@@ -311,7 +312,7 @@ class InvokeToolExecutor(StepExecutor, ToolExecutionMixin):
                 value = result
             if value is not None:
-                output_vars[step_var_id] = value
+                output_vars[step_var.id] = value
         return output_vars

qtype 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

qtype 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl