datafun_streaming-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datafun_streaming/__init__.py +1 -0
- datafun_streaming/_version.py +24 -0
- datafun_streaming/core/__init__.py +1 -0
- datafun_streaming/core/types.py +16 -0
- datafun_streaming/data_validation/__init__.py +1 -0
- datafun_streaming/data_validation/errors.py +24 -0
- datafun_streaming/data_validation/reference.py +63 -0
- datafun_streaming/data_validation/types.py +42 -0
- datafun_streaming/data_validation/validation_utils.py +143 -0
- datafun_streaming/io/__init__.py +1 -0
- datafun_streaming/io/errors.py +50 -0
- datafun_streaming/io/io_utils.py +109 -0
- datafun_streaming/kafka/__init__.py +1 -0
- datafun_streaming/kafka/errors.py +150 -0
- datafun_streaming/kafka/kafka_admin_utils.py +211 -0
- datafun_streaming/kafka/kafka_connection_utils.py +46 -0
- datafun_streaming/kafka/kafka_consumer_utils.py +62 -0
- datafun_streaming/kafka/kafka_producer_utils.py +96 -0
- datafun_streaming/kafka/kafka_settings.py +79 -0
- datafun_streaming/py.typed +0 -0
- datafun_streaming/stats/__init__.py +1 -0
- datafun_streaming/stats/stats_utils.py +110 -0
- datafun_streaming/storage/__init__.py +1 -0
- datafun_streaming/storage/duckdb_utils.py +244 -0
- datafun_streaming/visualization/__init__.py +1 -0
- datafun_streaming/visualization/chart_utils.py +150 -0
- datafun_streaming-0.1.0.dist-info/METADATA +168 -0
- datafun_streaming-0.1.0.dist-info/RECORD +30 -0
- datafun_streaming-0.1.0.dist-info/WHEEL +4 -0
- datafun_streaming-0.1.0.dist-info/licenses/LICENSE +21 -0

datafun_streaming/__init__.py
@@ -0,0 +1 @@
+"""Shared streaming utilities for Kafka, DuckDB, validation, and visualization."""

datafun_streaming/_version.py
@@ -0,0 +1,24 @@
+# file generated by vcs-versioning
+# don't change, don't track in version control
+from __future__ import annotations
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+version: str
+__version__: str
+__version_tuple__: tuple[int | str, ...]
+version_tuple: tuple[int | str, ...]
+commit_id: str | None
+__commit_id__: str | None
+
+__version__ = version = '0.1.0'
+__version_tuple__ = version_tuple = (0, 1, 0)
+
+__commit_id__ = commit_id = None
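
Usage note: since the module binds both the dunder and plain names, either spelling works at runtime. A minimal sketch, assuming the package is installed:

    # Read the generated version metadata at runtime.
    from datafun_streaming._version import __version__, commit_id, version_tuple

    print(__version__)    # 0.1.0
    print(version_tuple)  # (0, 1, 0)
    print(commit_id)      # None for this build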

datafun_streaming/core/__init__.py
@@ -0,0 +1 @@
+"""Shared primitives."""

datafun_streaming/core/types.py
@@ -0,0 +1,16 @@
+"""src/datafun_streaming/core/types.py.
+
+Shared type aliases used across all datafun_streaming subpackages.
+Import from here when type-hinting streaming records in any module.
+"""
+
+__all__ = [
+    "DataRecordDict",
+    "DataRecordDictList",
+]
+
+# One message / row / record as a dictionary of text values.
+DataRecordDict = dict[str, str]
+
+# A list of messages / rows / records.
+DataRecordDictList = list[DataRecordDict]
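
Usage note: a minimal sketch of using these aliases as type hints downstream; the first_record helper and field names are illustrative only, not part of the package:

    from datafun_streaming.core.types import DataRecordDict, DataRecordDictList

    def first_record(records: DataRecordDictList) -> DataRecordDict:
        # Return the first record, or an empty record if the list is empty.
        return records[0] if records else {}

    rows: DataRecordDictList = [{"sale_id": "1", "amount": "9.99"}]
    print(first_record(rows)["amount"])  # 9.99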

datafun_streaming/data_validation/__init__.py
@@ -0,0 +1 @@
+"""Data validation utilities, types, and error messages for streaming pipelines."""

datafun_streaming/data_validation/errors.py
@@ -0,0 +1,24 @@
+"""data_validation/errors.py.
+
+Error messages for validation.
+"""
+
+
+def reference_validation_failed_message(*, label: str, error_count: int) -> str:
+    """Return help text when a reference data file fails validation."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+A reference data file failed validation.
+File: {label}
+Errors: {error_count} problem(s) found.
+
+The producer cannot run until all reference files are valid.
+Fix the reference file before retrying.
+
+CHECK:
+1. Open data/{label} and inspect the header row.
+2. Confirm all required fields are present and spelled correctly.
+3. Confirm no rows have blank values in required fields.
+4. See data_contract_case.py for the list of required fields.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
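
Usage note: a minimal sketch of rendering this banner; the file label and error count are made up for illustration:

    from datafun_streaming.data_validation.errors import (
        reference_validation_failed_message,
    )

    # Prints the full CHECK banner for a hypothetical regions.csv with 2 problems.
    print(reference_validation_failed_message(label="regions.csv", error_count=2))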

datafun_streaming/data_validation/reference.py
@@ -0,0 +1,63 @@
+"""src/datafun_streaming/data_validation/reference.py.
+
+Reference data validation helpers.
+
+Provides functions for working with lookup tables:
+building lookup sets from CSV rows and validating reference records.
+"""
+
+# === IMPORTS ===
+
+from datafun_streaming.core.types import DataRecordDictList
+from datafun_streaming.data_validation.types import AllowedValuesSet
+from datafun_streaming.data_validation.validation_utils import validate_required_fields
+
+# === EXPORTS ===
+
+__all__ = [
+    "make_lookup_set",
+    "validate_reference_records",
+]
+
+
+def make_lookup_set(records: DataRecordDictList, key_field: str) -> AllowedValuesSet:
+    """Create a set of allowed values for a field in a reference table.
+
+    Arguments:
+        records: A list of row dictionaries from a reference CSV file.
+        key_field: The field to use as the key for allowed values.
+
+    Returns:
+        A set of allowed values for the specified key field.
+    """
+    values: AllowedValuesSet = set()
+    for record in records:
+        value: str = record.get(key_field, "").strip()
+        if value:
+            values.add(value)
+    return values
+
+
+def validate_reference_records(
+    *,
+    records: DataRecordDictList,
+    required_fields: list[str],
+    label: str,
+) -> list[str]:
+    """Validate reference records and return file-level errors.
+
+    Arguments:
+        records: Reference data records to validate.
+        required_fields: Field names required in each record.
+        label: Label for this reference file, used in error messages.
+
+    Returns:
+        A list of errors, or an empty list if all records are valid.
+    """
+    errors: list[str] = []
+    for record_number, record in enumerate(records, start=1):
+        for error in validate_required_fields(
+            record=record, required_fields=required_fields
+        ):
+            errors.append(f"{label} record {record_number}: {error}")
+    return errors
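
Usage note: a minimal sketch of the two helpers together; the rows, field names, and label are illustrative:

    from datafun_streaming.data_validation.reference import (
        make_lookup_set,
        validate_reference_records,
    )

    rows = [{"region_id": "US-MO"}, {"region_id": "  "}]
    errors = validate_reference_records(
        records=rows, required_fields=["region_id"], label="regions.csv"
    )
    print(errors)  # ['regions.csv record 2: Required field is blank: region_id']

    # Blank values are skipped, so only the valid id survives.
    print(make_lookup_set(rows, "region_id"))  # {'US-MO'}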

datafun_streaming/data_validation/types.py
@@ -0,0 +1,42 @@
+"""src/datafun_streaming/data_validation/types.py.
+
+Type aliases and dataclasses for validation results.
+
+Import from here whenever you need to type-hint a record or validation result.
+"""
+
+# === IMPORTS ===
+
+from dataclasses import dataclass
+
+from datafun_streaming.core.types import DataRecordDict, DataRecordDictList
+
+# === EXPORTS ===
+
+__all__ = [
+    "DataRecordDict",
+    "DataRecordDictList",
+    "ErrorMessage",
+    "ErrorMessages",
+    "AllowedValuesSet",
+    "ValidationResult",
+]
+
+# === TYPE ALIASES ===
+
+ErrorMessage = str
+ErrorMessages = list[ErrorMessage]
+AllowedValuesSet = set[str]
+
+
+@dataclass(frozen=True)
+class ValidationResult:
+    """Result from checking one record against the data contract.
+
+    Attributes:
+        is_valid: True if the record passed all validation checks.
+        errors: List of error messages; empty when is_valid is True.
+    """
+
+    is_valid: bool
+    errors: ErrorMessages
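
Usage note: a minimal sketch of wrapping collected errors in the result type; the error text is illustrative:

    from datafun_streaming.data_validation.types import ErrorMessages, ValidationResult

    errors: ErrorMessages = ["Missing required field: sale_id"]
    result = ValidationResult(is_valid=not errors, errors=errors)
    print(result.is_valid)  # False

Because the dataclass is frozen, a result can be shared across pipeline stages without risk of mutation.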

datafun_streaming/data_validation/validation_utils.py
@@ -0,0 +1,143 @@
+"""src/datafun_streaming/data_validation/validation_utils.py.
+
+Generic field-level validation functions.
+
+Each function checks one thing about one value and returns a list of
+error strings: empty if valid, one or more messages if not.
+These functions know nothing about domains, reference data, or business rules.
+They only check types, formats, and value ranges.
+
+NOTE:
+Add functions to this file as validation requirements evolve.
+"""
+
+# === IMPORTS ===
+
+from datetime import datetime
+
+from datafun_streaming.core.types import DataRecordDict
+from datafun_streaming.data_validation.types import ErrorMessages
+
+# === EXPORTS ===
+
+__all__ = [
+    "add_validation_errors",
+    "validate_boolean_text",
+    "validate_datetime",
+    "validate_positive_integer",
+    "validate_required_fields",
+]
+
+
+def add_validation_errors(
+    *,
+    record: DataRecordDict,
+    errors: ErrorMessages,
+) -> DataRecordDict:
+    """Return a copy of a record with validation errors attached.
+
+    Arguments:
+        record: A dictionary representing one data record.
+        errors: A list of validation error messages.
+
+    Returns:
+        A copy of the record with a validation_errors field appended.
+    """
+    output = dict(record)
+    output["validation_errors"] = " | ".join(errors)
+    return output
+
+
+def validate_boolean_text(value: str, *, field_name: str) -> list[str]:
+    """Return errors for an invalid boolean text value.
+
+    All boolean values must be represented as
+    "true" or "false" (case-insensitive).
+
+    Arguments:
+        value: The text value to validate.
+        *: All arguments after the asterisk must be passed as keyword arguments.
+        field_name: The name of the field being validated, for error messages.
+
+    Returns:
+        A list of errors, or an empty list if the value is valid.
+    """
+    allowed_values = {"true", "false"}
+
+    if value.lower() not in allowed_values:
+        return [f"{field_name} must be true or false: {value}"]
+
+    return []
+
+
+def validate_datetime(value: str) -> list[str]:
+    """Return errors for an invalid datetime value.
+
+    All datetime values must be in ISO 8601 format.
+
+    Arguments:
+        value: The text value to validate.
+
+    Returns:
+        A list of errors, or an empty list if the value is valid.
+    """
+    try:
+        datetime.fromisoformat(value.replace("Z", "+00:00"))
+    except ValueError:
+        return [f"Invalid datetime: {value}"]
+
+    return []
+
+
+def validate_positive_integer(value: str) -> list[str]:
+    """Return errors for an invalid positive integer value.
+
+    All positive integer values must be integers greater than or equal to 1.
+
+    Arguments:
+        value: The text value to validate.
+
+    Returns:
+        A list of errors, or an empty list if the value is valid.
+    """
+    try:
+        number = int(value)
+    except ValueError:
+        return [f"Value must be an integer: {value}"]
+
+    if number < 1:
+        return [f"Value must be at least 1: {value}"]
+
+    return []
+
+
+# === DEFINE FIELD VALIDATION HELPERS ===
+
+
+def validate_required_fields(
+    *,
+    record: DataRecordDict,
+    required_fields: list[str],
+) -> list[str]:
+    """Return errors for missing or blank required fields.
+
+    All required fields must be present and not blank.
+
+    Arguments:
+        record: A dictionary representing one data record / row.
+        *: All arguments after the asterisk must be passed as keyword arguments.
+        required_fields: A list of field names that are required.
+
+    Returns:
+        A list of errors, or
+        an empty list if all required fields are present.
+    """
+    errors: list[str] = []
+
+    for field_name in required_fields:
+        if field_name not in record:
+            errors.append(f"Missing required field: {field_name}")
+        elif not record[field_name].strip():
+            errors.append(f"Required field is blank: {field_name}")
+
+    return errors
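
Usage note: a minimal sketch chaining the validators over one record; the field names (sale_id, sold_at, is_online) are illustrative, not part of any data contract in the package:

    from datafun_streaming.data_validation.validation_utils import (
        add_validation_errors,
        validate_boolean_text,
        validate_datetime,
        validate_positive_integer,
        validate_required_fields,
    )

    record = {"sale_id": "0", "sold_at": "2024-01-01T00:00:00Z", "is_online": "maybe"}

    errors: list[str] = []
    errors += validate_required_fields(
        record=record, required_fields=["sale_id", "sold_at"]
    )
    errors += validate_positive_integer(record["sale_id"])
    errors += validate_datetime(record["sold_at"])
    errors += validate_boolean_text(record["is_online"], field_name="is_online")

    # The two failures are joined with " | " into a validation_errors field:
    # Value must be at least 1: 0 | is_online must be true or false: maybe
    print(add_validation_errors(record=record, errors=errors)["validation_errors"])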

datafun_streaming/io/__init__.py
@@ -0,0 +1 @@
+"""File I/O utilities for reading and writing data formats."""

datafun_streaming/io/errors.py
@@ -0,0 +1,50 @@
+"""io/errors.py."""
+
+
+# === EXPORTS ===
+
+__all__ = [
+    "missing_csv_file_message",
+    "missing_csv_field_message",
+]
+
+# === DEFINE HELPER FUNCTIONS ===
+
+
+def missing_csv_file_message(*, path: str) -> str:
+    """Return help text for a missing CSV file."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+This project needs a CSV file to generate messages.
+Required CSV file not found:
+{path}
+
+CHECK:
+1. Confirm you are running the command from the project root folder.
+2. Confirm the data folder exists.
+3. Confirm data/sales.csv exists.
+4. If the file was deleted, restore it from the repository.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def missing_csv_field_message(*, field: str, available_fields: list[str]) -> str:
+    """Return help text for a missing CSV field."""
+    fields = ", ".join(available_fields)
+
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+The project read the CSV file,
+but an expected column was not present.
+Required CSV field missing:
+{field}
+
+Available fields were:
+{fields}
+
+CHECK:
+1. Open data/sales.csv.
+2. Confirm the header row includes: {field}
+3. Header names must match exactly.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
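
Usage note: a minimal sketch of failing fast with the field banner; the header list and column name are illustrative:

    from datafun_streaming.io.errors import missing_csv_field_message

    header = ["sale_id", "amount"]
    if "sold_at" not in header:
        raise ValueError(
            missing_csv_field_message(field="sold_at", available_fields=header)
        )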

datafun_streaming/io/io_utils.py
@@ -0,0 +1,109 @@
+"""src/datafun_streaming/io/io_utils.py.
+
+CSV and JSON helpers for streaming examples.
+"""
+
+# === IMPORTS ===
+
+import csv
+import json
+from pathlib import Path
+from typing import Any
+
+from datafun_streaming.io.errors import missing_csv_file_message
+
+# === EXPORTS ===
+
+__all__ = [
+    "append_csv_row",
+    "format_message_for_log",
+    "read_csv_as_lookup",
+    "read_csv_rows",
+    "row_to_json",
+    "row_from_json",
+]
+
+# === DEFINE HELPER FUNCTIONS ===
+
+
+def append_csv_row(path: Path, row: dict[str, Any], fieldnames: list[str]) -> None:
+    """Append one row to a CSV file, writing the header first if needed."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    file_exists = path.exists()
+
+    with path.open(mode="a", encoding="utf-8", newline="") as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+
+        if not file_exists:
+            writer.writeheader()
+
+        writer.writerow(row)
+
+
+def format_message_for_log(message: dict[str, Any]) -> str:
+    """Format one message dictionary for readable log output."""
+    lines = ["{"]
+
+    for key, value in message.items():
+        lines.append(f" {key}: {value}")
+
+    lines.append("}")
+    return "\n".join(lines)
+
+
+def read_csv_as_lookup(
+    path: Path,
+    *,
+    key_field: str,
+    value_field: str,
+) -> dict[str, Any]:
+    """Read a CSV file into a key-value lookup dictionary.
+
+    Arguments:
+        path: Path to the CSV file.
+        key_field: The column to use as the dictionary key.
+        value_field: The column to use as the dictionary value.
+
+    Returns:
+        A dict mapping each key_field value to its value_field value.
+
+    Example:
+        region_lookup = read_csv_as_lookup(
+            REGIONS_CSV, key_field="region_id", value_field="tax_rate_pct"
+        )
+        tax_rate = float(region_lookup["US-MO"]) / 100.0
+    """
+    rows = read_csv_rows(path)
+    return {row[key_field]: row[value_field] for row in rows}
+
+
+def read_csv_rows(path: Path) -> list[dict[str, str]]:
+    """Read a CSV file into a list of string dictionaries."""
+    if not path.exists():
+        msg = missing_csv_file_message(path=path.as_posix())
+        raise FileNotFoundError(msg)
+
+    with path.open(mode="r", encoding="utf-8", newline="") as file:
+        reader = csv.DictReader(file)
+
+        if reader.fieldnames is None:
+            msg = f"CSV file has no header row: {path.as_posix()}"
+            raise ValueError(msg)
+
+        return list(reader)
+
+
+def row_to_json(row: dict[str, Any]) -> str:
+    """Convert a row dictionary to compact JSON text."""
+    return json.dumps(row, sort_keys=True, separators=(",", ":"))
+
+
+def row_from_json(text: str) -> dict[str, Any]:
+    """Convert JSON text to a row dictionary."""
+    value = json.loads(text)
+
+    if not isinstance(value, dict):
+        msg = "Expected JSON object."
+        raise ValueError(msg)
+
+    return value
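
Usage note: a minimal round-trip sketch of the CSV and JSON helpers; the output path and field names are illustrative:

    from pathlib import Path

    from datafun_streaming.io.io_utils import (
        append_csv_row,
        read_csv_rows,
        row_from_json,
        row_to_json,
    )

    path = Path("data/demo_sales.csv")  # hypothetical file; created on first append
    append_csv_row(path, {"sale_id": "1", "amount": "9.99"}, ["sale_id", "amount"])

    text = row_to_json(read_csv_rows(path)[0])
    print(text)                 # {"amount":"9.99","sale_id":"1"}
    print(row_from_json(text))  # {'amount': '9.99', 'sale_id': '1'}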

datafun_streaming/kafka/__init__.py
@@ -0,0 +1 @@
+"""Kafka producer, consumer, admin, and connection utilities."""

datafun_streaming/kafka/errors.py
@@ -0,0 +1,150 @@
+"""src/datafun_streaming/kafka/errors.py.
+
+Error messages for Kafka.
+"""
+
+# === EXPORTS ===
+
+__all__ = [
+    "kafka_admin_failed_message",
+    "kafka_consume_failed_message",
+    "kafka_delivery_failed_message",
+    "kafka_no_messages_message",
+    "kafka_not_reachable_message",
+    "kafka_topic_empty_message",
+    "kafka_topic_not_found_message",
+]
+
+# === DEFINE HELPER FUNCTIONS ===
+
+
+def kafka_admin_failed_message(*, operation: str, topic: str, detail: str) -> str:
+    """Return help text for a failed Kafka admin operation."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+A Kafka admin operation failed.
+Operation: {operation}
+Topic: {topic}
+Details:
+{detail}
+
+CHECK:
+1. Confirm Kafka is running. Follow ref_START_KAFKA.md.
+2. Confirm you have permission to {operation} topics.
+3. Try the operation manually from the CLI:
+   cd ~/kafka
+   bin/kafka-topics.sh --list --bootstrap-server localhost:9092
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def kafka_consume_failed_message(*, detail: str) -> str:
+    """Return help text for a Kafka consume failure."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+The consumer ran, but Kafka did not return a usable message.
+Kafka reported an error while consuming a message.
+Details:
+{detail}
+
+CHECK:
+1. Confirm Kafka is running.
+2. Confirm the topic exists. Follow MANAGE_TOPIC.md.
+3. Run the producer again if the topic has no messages.
+4. If you already consumed these messages,
+   set a different KAFKA_GROUP_ID in .env.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def kafka_delivery_failed_message(*, detail: str) -> str:
+    """Return help text for a Kafka delivery failure."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+The message was generated, but Kafka did not accept it.
+Kafka did not confirm message delivery.
+Details:
+{detail}
+
+CHECK:
+1. Confirm Kafka is running.
+2. Confirm the topic exists.
+3. Confirm the broker is reachable at localhost:9092.
+4. Try MANAGE_TOPIC.md to verify Kafka independently of Python.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def kafka_no_messages_message() -> str:
+    """Return help text when no Kafka messages are consumed."""
+    return """
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+Kafka may be reachable, but no unread messages
+were available for this consumer.
+No message received before timeout.
+
+CHECK:
+1. Confirm Kafka is running.
+2. Confirm the topic exists. Follow MANAGE_TOPIC.md.
+3. Run the producer in another project terminal.
+4. If this consumer group already read the messages,
+   set a different KAFKA_GROUP_ID in .env.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def kafka_not_reachable_message(*, bootstrap_servers: str) -> str:
+    """Return help text for a Kafka connection failure."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+Python code is running,
+but Kafka is not available.
+Kafka is not reachable.
+The program tried to connect to:
+KAFKA_BOOTSTRAP_SERVERS = {bootstrap_servers}
+
+CHECK:
+1. Start Kafka first. Follow START_KAFKA.md.
+2. Verify Kafka is running. In a terminal, run:
+   cd ~/kafka
+   bin/kafka-topics.sh --list --bootstrap-server localhost:9092
+3. Verify the topic exists. Follow MANAGE_TOPIC.md.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def kafka_topic_empty_message(*, topic: str) -> str:
+    """Return help text when a Kafka topic exists but has no messages."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+The topic exists but contains no messages.
+Topic is empty:
+KAFKA_TOPIC = {topic}
+
+CHECK:
+1. Run the producer first to send messages to this topic.
+2. If you already ran the producer, confirm it completed successfully.
+3. If messages were consumed by another consumer group,
+   run the producer again to repopulate the topic.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
+
+
+def kafka_topic_not_found_message(*, topic: str, bootstrap_servers: str) -> str:
+    """Return help text when a required Kafka topic does not exist."""
+    return f"""
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+The topic does not exist in Kafka.
+Topic not found:
+KAFKA_TOPIC = {topic}
+KAFKA_BOOTSTRAP_SERVERS = {bootstrap_servers}
+
+CHECK:
+1. Create the topic first. Follow ref_MANAGE_TOPIC.md.
+   cd ~/kafka
+   bin/kafka-topics.sh --create --topic {topic} --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1
+2. Confirm the topic was created:
+   bin/kafka-topics.sh --list --bootstrap-server localhost:9092
+3. Confirm KAFKA_TOPIC in .env matches the topic you created.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+""".strip()
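
Usage note: a minimal sketch of surfacing these banners from calling code; the topic name is illustrative, and the actual connection check presumably lives in the kafka_connection_utils module listed above:

    from datafun_streaming.kafka.errors import (
        kafka_not_reachable_message,
        kafka_topic_not_found_message,
    )

    servers = "localhost:9092"  # matches the CHECK steps in the banners
    print(kafka_not_reachable_message(bootstrap_servers=servers))
    print(kafka_topic_not_found_message(topic="sales", bootstrap_servers=servers))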