PyPI - cudag - Versions diffs - 0.3.10__py3-none-any.whl - Mend

cudag 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)

cudag/__init__.py +334 -0
cudag/annotation/__init__.py +77 -0
cudag/annotation/codegen.py +648 -0
cudag/annotation/config.py +545 -0
cudag/annotation/loader.py +342 -0
cudag/annotation/scaffold.py +121 -0
cudag/annotation/transcription.py +296 -0
cudag/cli/__init__.py +5 -0
cudag/cli/main.py +315 -0
cudag/cli/new.py +873 -0
cudag/core/__init__.py +364 -0
cudag/core/button.py +137 -0
cudag/core/canvas.py +222 -0
cudag/core/config.py +70 -0
cudag/core/coords.py +233 -0
cudag/core/data_grid.py +804 -0
cudag/core/dataset.py +678 -0
cudag/core/distribution.py +136 -0
cudag/core/drawing.py +75 -0
cudag/core/fonts.py +156 -0
cudag/core/generator.py +163 -0
cudag/core/grid.py +367 -0
cudag/core/grounding_task.py +247 -0
cudag/core/icon.py +207 -0
cudag/core/iconlist_task.py +301 -0
cudag/core/models.py +1251 -0
cudag/core/random.py +130 -0
cudag/core/renderer.py +190 -0
cudag/core/screen.py +402 -0
cudag/core/scroll_task.py +254 -0
cudag/core/scrollable_grid.py +447 -0
cudag/core/state.py +110 -0
cudag/core/task.py +293 -0
cudag/core/taskbar.py +350 -0
cudag/core/text.py +212 -0
cudag/core/utils.py +82 -0
cudag/data/surnames.txt +5000 -0
cudag/modal_apps/__init__.py +4 -0
cudag/modal_apps/archive.py +103 -0
cudag/modal_apps/extract.py +138 -0
cudag/modal_apps/preprocess.py +529 -0
cudag/modal_apps/upload.py +317 -0
cudag/prompts/SYSTEM_PROMPT.txt +104 -0
cudag/prompts/__init__.py +33 -0
cudag/prompts/system.py +43 -0
cudag/prompts/tools.py +382 -0
cudag/py.typed +0 -0
cudag/schemas/filesystem.json +90 -0
cudag/schemas/test_record.schema.json +113 -0
cudag/schemas/train_record.schema.json +90 -0
cudag/server/__init__.py +21 -0
cudag/server/app.py +232 -0
cudag/server/services/__init__.py +9 -0
cudag/server/services/generator.py +128 -0
cudag/templates/scripts/archive.sh +35 -0
cudag/templates/scripts/build.sh +13 -0
cudag/templates/scripts/extract.sh +54 -0
cudag/templates/scripts/generate.sh +116 -0
cudag/templates/scripts/pre-commit.sh +44 -0
cudag/templates/scripts/preprocess.sh +46 -0
cudag/templates/scripts/upload.sh +63 -0
cudag/templates/scripts/verify.py +428 -0
cudag/validation/__init__.py +35 -0
cudag/validation/validate.py +508 -0
cudag-0.3.10.dist-info/METADATA +570 -0
cudag-0.3.10.dist-info/RECORD +69 -0
cudag-0.3.10.dist-info/WHEEL +4 -0
cudag-0.3.10.dist-info/entry_points.txt +2 -0
cudag-0.3.10.dist-info/licenses/LICENSE +66 -0

cudag/annotation/transcription.py ADDED Viewed

@@ -0,0 +1,296 @@
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+"""HTML transcription parsing for grid elements.
+This module parses HTML table transcriptions from annotations into structured
+data that generators can use to create similar synthetic data.
+Example:
+    from cudag.annotation import parse_transcription
+    html = "<table><tr><td>10/07/2025</td><td>John</td></tr></table>"
+    table = parse_transcription(html)
+    for row in table.rows:
+        print([cell.text for cell in row.cells])
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from html.parser import HTMLParser
+from typing import Any
+@dataclass
+class TranscriptionCell:
+    """A single cell from a transcribed table.
+    The ``is_*`` properties are lightweight pattern checks on the stripped
+    cell text. They classify the *shape* of a value only and do not validate
+    it (``99/99/2025`` still looks like a date).
+    """
+    text: str
+    """Full cell text with line breaks converted to spaces."""
+    lines: list[str] = field(default_factory=list)
+    """Cell content split by <br/> tags, preserving multi-line data."""
+    # Compiled once at class creation. These are un-annotated class attributes,
+    # so the dataclass machinery does not treat them as fields.
+    # The lookahead requires at least one digit so that "," or "$,," is rejected.
+    _CURRENCY_RE = re.compile(r"^\$?(?=.*\d)[\d,]+\.?\d*$")
+    _DATE_RE = re.compile(r"^\d{1,2}/\d{1,2}/\d{2,4}$")
+    # The meridiem is "a"/"p" optionally followed by "m"; a lone "m" is not valid.
+    _TIME_RE = re.compile(r"^\d{1,2}:\d{2}(?:[ap]m?)?$", re.I)
+    @property
+    def first_line(self) -> str:
+        """Return the first recorded line, or the full text when no lines exist."""
+        return self.lines[0] if self.lines else self.text
+    @property
+    def is_empty(self) -> bool:
+        """Return True when the cell text is empty or whitespace only."""
+        return not self.text.strip()
+    @property
+    def is_currency(self) -> bool:
+        """Return True when the text looks like an amount, e.g. ``$1,234.56`` or ``100``.
+        An optional leading ``$`` is allowed, followed by digits and commas and
+        an optional decimal part. The text must contain at least one digit.
+        """
+        return bool(self._CURRENCY_RE.match(self.text.strip()))
+    @property
+    def is_date(self) -> bool:
+        """Return True when the text looks like a slash-separated date (MM/DD/YYYY or similar)."""
+        return bool(self._DATE_RE.match(self.text.strip()))
+    @property
+    def is_time(self) -> bool:
+        """Return True when the text looks like a clock time (e.g. 11:16a, 3:25p, 3:25PM, 15:40)."""
+        return bool(self._TIME_RE.match(self.text.strip()))
+@dataclass
+class TranscriptionRow:
+    """One table row: an ordered list of cells with list-like access."""
+    cells: list[TranscriptionCell] = field(default_factory=list)
+    def __len__(self) -> int:
+        """Return the number of cells in the row."""
+        return len(self.cells)
+    def __getitem__(self, index: int) -> TranscriptionCell:
+        """Return the cell at ``index`` using ordinary list indexing rules."""
+        return self.cells[index]
+    def get(self, index: int, default: str = "") -> str:
+        """Return the text of the cell at ``index``, or ``default`` when out of range.
+        Negative indices count as out of range; they do not wrap around.
+        """
+        if index < 0 or index >= len(self.cells):
+            return default
+        return self.cells[index].text
+    @property
+    def values(self) -> list[str]:
+        """Return the text of every cell, in column order."""
+        texts: list[str] = []
+        for cell in self.cells:
+            texts.append(cell.text)
+        return texts
+@dataclass
+class ParsedTranscription:
+    """Structured data parsed from an HTML table transcription."""
+    rows: list[TranscriptionRow] = field(default_factory=list)
+    """All data rows (excludes header if detected)."""
+    headers: list[str] = field(default_factory=list)
+    """Header row values (if <thead> was present)."""
+    raw_html: str = ""
+    """Original HTML for reference."""
+    @property
+    def num_rows(self) -> int:
+        """Number of data rows."""
+        return len(self.rows)
+    @property
+    def num_cols(self) -> int:
+        """Number of columns, taken from the headers if present, else from the first row.
+        Rows are not checked for equal length; 0 is returned for an empty table.
+        """
+        if self.headers:
+            return len(self.headers)
+        if self.rows:
+            return len(self.rows[0])
+        return 0
+    def column(self, index: int) -> list[str]:
+        """Return the text of column ``index`` for every row ("" where a row is too short)."""
+        return [row.get(index) for row in self.rows]
+    def sample_values(self, col_index: int, max_samples: int = 10) -> list[str]:
+        """Return up to ``max_samples`` distinct non-empty values from a column.
+        Values are stripped, de-duplicated and kept in first-seen order.
+        A non-positive ``max_samples`` yields an empty list.
+        """
+        samples: list[str] = []
+        if max_samples <= 0:
+            return samples
+        seen: set[str] = set()
+        for row in self.rows:
+            val = row.get(col_index).strip()
+            if not val or val in seen:
+                continue
+            seen.add(val)
+            samples.append(val)
+            if len(samples) >= max_samples:
+                break
+        return samples
+    def infer_column_types(self) -> list[str]:
+        """Infer data types for each column based on content.
+        Only the first five rows are inspected. Within them, empty cells are
+        skipped and the first cell that is multi-line, a date, a time or a
+        currency value decides the column type; otherwise it stays 'text'.
+        Returns:
+            List of type hints: 'date', 'time', 'currency', 'text', 'multiline'
+        """
+        if not self.rows:
+            return []
+        types = []
+        for col_idx in range(self.num_cols):
+            col_type = "text"
+            for row in self.rows[:5]:
+                # Short rows simply have no cell in this column.
+                if col_idx >= len(row.cells):
+                    continue
+                cell = row.cells[col_idx]
+                if cell.is_empty:
+                    continue
+                # Order matters: multi-line wins over the single-value patterns.
+                if len(cell.lines) > 1:
+                    col_type = "multiline"
+                    break
+                elif cell.is_date:
+                    col_type = "date"
+                    break
+                elif cell.is_time:
+                    col_type = "time"
+                    break
+                elif cell.is_currency:
+                    col_type = "currency"
+                    break
+            types.append(col_type)
+        return types
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to a plain dictionary (headers, rows of text/lines, counts) for serialization."""
+        return {
+            "headers": self.headers,
+            "rows": [
+                [{"text": c.text, "lines": c.lines} for c in row.cells]
+                for row in self.rows
+            ],
+            "num_rows": self.num_rows,
+            "num_cols": self.num_cols,
+        }
+class _TableHTMLParser(HTMLParser):
+    """Internal HTML parser that collects table rows and header cells.
+    After ``feed()``, ``rows`` holds the data rows and ``headers`` holds the
+    cell text of the first row seen inside ``<thead>``.
+    HTML allows ``</td>``, ``</th>`` and ``</tr>`` to be omitted, so an open
+    cell or row is finished implicitly when the next sibling cell/row or the
+    end of the enclosing section is reached, instead of being discarded.
+    Implicit closing only applies at the same ``<table>`` nesting depth.
+    Nested tables are not modelled: a row opened inside a nested table
+    replaces the pending outer row.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self.rows: list[TranscriptionRow] = []
+        self.headers: list[str] = []
+        self._current_row: TranscriptionRow | None = None
+        self._current_cell_lines: list[str] = []
+        self._current_cell_text: str = ""
+        self._in_thead = False
+        self._in_tbody = False
+        self._in_cell = False
+        # Nesting bookkeeping used to tell "missing end tag" from "nested table".
+        self._table_depth = 0
+        self._row_depth = 0
+        self._cell_depth = 0
+    def _finish_cell(self) -> None:
+        """Append the cell being collected to the current row and reset cell state."""
+        if self._in_cell and self._current_row is not None:
+            if self._current_cell_text:
+                self._current_cell_lines.append(self._current_cell_text.strip())
+            self._current_row.cells.append(
+                TranscriptionCell(
+                    text=" ".join(self._current_cell_lines),
+                    lines=self._current_cell_lines.copy(),
+                )
+            )
+        self._in_cell = False
+        self._current_cell_lines = []
+        self._current_cell_text = ""
+    def _finish_row(self) -> None:
+        """Store the current row as headers (first <thead> row) or as a data row."""
+        if self._current_row is None:
+            return
+        # A cell left open in this very row (missing </td>) still belongs to it.
+        if self._in_cell and self._cell_depth == self._row_depth:
+            self._finish_cell()
+        row = self._current_row
+        self._current_row = None
+        if self._in_thead and not self.headers:
+            self.headers = [c.text for c in row.cells]
+        else:
+            self.rows.append(row)
+    def _close_open_row(self) -> None:
+        """Finish a row that was left open at the current table depth (missing </tr>)."""
+        if self._current_row is not None and self._row_depth == self._table_depth:
+            self._finish_row()
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Track table structure and start new rows/cells; ``<br>`` ends a cell line."""
+        tag = tag.lower()
+        if tag == "table":
+            self._table_depth += 1
+        elif tag == "thead":
+            self._close_open_row()
+            self._in_thead = True
+        elif tag == "tbody":
+            # Close a pending row first so an unterminated header row is still
+            # classified while the <thead> flag is set.
+            self._close_open_row()
+            self._in_tbody = True
+        elif tag == "tr":
+            self._close_open_row()
+            self._current_row = TranscriptionRow()
+            self._row_depth = self._table_depth
+        elif tag in ("td", "th"):
+            # A new sibling cell implicitly closes the previous one.
+            if self._in_cell and self._cell_depth == self._table_depth:
+                self._finish_cell()
+            self._in_cell = True
+            self._cell_depth = self._table_depth
+            self._current_cell_lines = []
+            self._current_cell_text = ""
+        elif tag == "br":
+            # Line break within cell - save current text as a line
+            if self._in_cell and self._current_cell_text:
+                self._current_cell_lines.append(self._current_cell_text.strip())
+                self._current_cell_text = ""
+    def handle_endtag(self, tag: str) -> None:
+        """Finish cells, rows and sections as their end tags arrive."""
+        tag = tag.lower()
+        if tag == "table":
+            self._close_open_row()
+            # Guard against a stray </table> driving the depth negative.
+            if self._table_depth > 0:
+                self._table_depth -= 1
+        elif tag == "thead":
+            self._close_open_row()
+            self._in_thead = False
+        elif tag == "tbody":
+            self._close_open_row()
+            self._in_tbody = False
+        elif tag == "tr":
+            self._finish_row()
+        elif tag in ("td", "th"):
+            self._finish_cell()
+    def handle_data(self, data: str) -> None:
+        """Accumulate text that appears inside a cell; text elsewhere is ignored."""
+        if self._in_cell:
+            self._current_cell_text += data
+def parse_transcription(html: str) -> ParsedTranscription:
+    """Turn an HTML table transcription into a ``ParsedTranscription``.
+    Empty or whitespace-only input, and input the parser raises on, produce
+    a result with no rows or headers; ``raw_html`` always echoes the input.
+    Args:
+        html: HTML string containing a <table> element
+    Returns:
+        ParsedTranscription holding the parsed rows, headers and the raw HTML
+    Example:
+        >>> html = "<table><tr><td>10/07/2025</td><td>$54.58</td></tr></table>"
+        >>> table = parse_transcription(html)
+        >>> table.rows[0].cells[0].text
+        '10/07/2025'
+        >>> table.rows[0].cells[0].is_date
+        True
+    """
+    has_markup = bool(html) and bool(html.strip())
+    if not has_markup:
+        return ParsedTranscription(raw_html=html)
+    table_parser = _TableHTMLParser()
+    try:
+        table_parser.feed(html)
+    except Exception:
+        # Best effort: a parse failure yields an empty table, not an error.
+        return ParsedTranscription(raw_html=html)
+    else:
+        return ParsedTranscription(
+            rows=table_parser.rows,
+            headers=table_parser.headers,
+            raw_html=html,
+        )
+def parse_text_transcription(text: str) -> str:
+    """Clean up a plain-text (non-table) transcription.
+    Text elements carry unstructured text, so the only processing is
+    trimming leading and trailing whitespace.
+    Args:
+        text: Raw transcription text
+    Returns:
+        The stripped text, or "" when ``text`` is empty or otherwise falsy
+    """
+    return text.strip() if text else ""

cudag/cli/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+"""CLI commands for CUDAG."""

cudag/cli/main.py ADDED Viewed

@@ -0,0 +1,315 @@
+# Copyright (c) 2025 Tylt LLC. All rights reserved.
+# CONFIDENTIAL AND PROPRIETARY. Unauthorized use, copying, or distribution
+# is strictly prohibited. For licensing inquiries: hello@claimhawk.app
+"""Main CLI entrypoint for CUDAG."""
+from __future__ import annotations
+from pathlib import Path
+import click
+from cudag import __version__
+@click.group()
+@click.version_option(version=__version__)
+def cli() -> None:
+    """CUDAG - ComputerUseDataAugmentedGeneration framework.
+    Create generator projects with 'cudag new', then generate datasets
+    with 'cudag generate'.
+    """
+    # Root command group: it does no work itself; subcommands attach to it below.
+@cli.command()
+@click.argument("name")
+@click.option(
+    "--output-dir",
+    "-o",
+    type=click.Path(),
+    default=".",
+    help="Directory to create the project in (default: current directory)",
+)
+def new(name: str, output_dir: str) -> None:
+    """Create a new CUDAG project.
+    NAME is the project name (e.g., 'appointment-picker').
+    """
+    # Function-scope import, as in the rest of this module's commands.
+    from cudag.cli.new import create_project
+    target_root = Path(output_dir)
+    project_dir = create_project(name, target_root)
+    click.echo(f"Created project: {project_dir}")
+    # Follow-up hints, printed one per line after the confirmation.
+    follow_up = (
+        "\nNext steps:",
+        f"  cd {project_dir}",
+        "  # Edit screen.py, state.py, renderer.py, and tasks/",
+        "  cudag generate --config config/dataset.yaml",
+    )
+    for hint in follow_up:
+        click.echo(hint)
+@cli.command()
+@click.option(
+    "--config",
+    "-c",
+    type=click.Path(exists=True),
+    required=True,
+    help="Path to dataset config YAML",
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    type=click.Path(),
+    help="Override output directory",
+)
+def generate(config: str, output_dir: str | None) -> None:
+    """Generate a dataset from the current project.
+    Requires a dataset config file (YAML) and the project's screen/task definitions.
+    """
+    # Placeholder command: it only reports the config path; output_dir is not used yet.
+    click.echo(f"Loading config: {Path(config)}")
+    # TODO: Implement full generation by loading project modules
+    # For now, show what would be done
+    click.echo("Generation not yet implemented - use project's generate.py directly")
+@cli.command()
+@click.argument("dataset_dir", type=click.Path(exists=True))
+def upload(dataset_dir: str) -> None:
+    """Upload a dataset to Modal volume.
+    DATASET_DIR is the path to the generated dataset directory.
+    """
+    # Placeholder: announces the target and reports that nothing is uploaded yet.
+    for message in (f"Uploading: {dataset_dir}", "Upload not yet implemented"):
+        click.echo(message)
+@cli.command()
+@click.argument("dataset_dir", type=click.Path(exists=True))
+@click.option(
+    "--verbose",
+    "-v",
+    is_flag=True,
+    help="Show all errors (default: first 10)",
+)
+def validate(dataset_dir: str, verbose: bool) -> None:
+    """Validate a dataset against the CUDAG schema.
+    DATASET_DIR is the path to the generated dataset directory.
+    Checks:
+    - Required filesystem structure (images/, test/, etc.)
+    - Training record schema (data.jsonl, train.jsonl, val.jsonl)
+    - Test record schema (test/test.json)
+    - Image path validity (all referenced images exist)
+    Exit codes:
+    - 0: Dataset is valid
+    - 1: Validation errors found
+    """
+    from cudag.validation import validate_dataset
+    errors = validate_dataset(Path(dataset_dir))
+    # Clean dataset: report success and exit 0 explicitly.
+    if not errors:
+        click.secho(f"Dataset valid: {dataset_dir}", fg="green")
+        raise SystemExit(0)
+    preview_limit = 10  # errors shown when --verbose is not given
+    total = len(errors)
+    click.secho(f"Found {total} validation error(s):", fg="red")
+    shown = errors if verbose else errors[:preview_limit]
+    for error in shown:
+        click.echo(f"  {error}")
+    # Tell the user how many were cut off by the preview limit.
+    hidden = total - preview_limit
+    if not verbose and hidden > 0:
+        click.echo(f"  ... and {hidden} more (use -v to see all)")
+    raise SystemExit(1)
+@cli.group()
+def eval() -> None:
+    """Evaluation commands."""
+    # Sub-group with no behaviour of its own; `@eval.command(...)` below attaches to it.
+    # NOTE(review): this name shadows the built-in eval() for the rest of the module.
+@eval.command("generate")
+@click.option("--count", "-n", default=100, help="Number of eval cases")
+@click.option("--dataset-dir", type=click.Path(exists=True), help="Dataset directory")
+def eval_generate(count: int, dataset_dir: str | None) -> None:
+    """Generate evaluation cases."""
+    # Placeholder: echoes the requested count; dataset_dir is accepted but not used yet.
+    announcement = f"Generating {count} eval cases"
+    click.echo(announcement)
+    click.echo("Eval generation not yet implemented")
+@eval.command("run")
+@click.option("--checkpoint", type=click.Path(exists=True), help="Model checkpoint")
+@click.option("--dataset-dir", type=click.Path(exists=True), help="Dataset directory")
+def eval_run(checkpoint: str | None, dataset_dir: str | None) -> None:
+    """Run evaluations on Modal."""
+    # Placeholder: both options are accepted but not used yet.
+    for message in ("Running evaluations", "Eval running not yet implemented"):
+        click.echo(message)
+@cli.command()
+def datasets() -> None:
+    """List datasets on Modal volume."""
+    # Placeholder: prints a status line, then the not-implemented notice.
+    status_lines = (
+        "Listing datasets on Modal volume...",
+        "Dataset listing not yet implemented",
+    )
+    for status in status_lines:
+        click.echo(status)
+@cli.command()
+@click.option(
+    "--host",
+    "-h",
+    default="127.0.0.1",
+    help="Host to bind to (default: 127.0.0.1)",
+)
+@click.option(
+    "--port",
+    "-p",
+    default=8420,
+    help="Port to listen on (default: 8420)",
+)
+@click.option(
+    "--reload",
+    is_flag=True,
+    help="Enable auto-reload for development",
+)
+def serve(host: str, port: int, reload: bool) -> None:
+    """Start the CUDAG server for annotation integration.
+    The server provides a REST API that the Annotator UI can use
+    to generate CUDAG projects without using the terminal.
+    Endpoints:
+      GET  /health           - Health check
+      POST /api/v1/generate  - Generate project from annotation
+      GET  /api/v1/status/{job_id} - Check generation progress
+    """
+    from cudag.server import run_server
+    # Print the address first, then hand control to the server.
+    base_url = f"http://{host}:{port}"
+    click.echo(f"Starting CUDAG server on {base_url}")
+    click.echo("Press Ctrl+C to stop")
+    run_server(host=host, port=port, reload=reload)
+def _read_optional_file(path: Path) -> bytes | None:
+    """Return the bytes of ``path``, or None when the file does not exist."""
+    return path.read_bytes() if path.exists() else None
+def _load_assets_from_dir(
+    folder: Path,
+) -> tuple[bytes | None, bytes | None, dict[str, bytes]]:
+    """Read original.png, masked.png and icons/*.png from an annotation folder."""
+    icons: dict[str, bytes] = {}
+    icons_dir = folder / "icons"
+    if icons_dir.exists():
+        for icon_file in icons_dir.glob("*.png"):
+            icons[icon_file.stem] = icon_file.read_bytes()
+    return (
+        _read_optional_file(folder / "original.png"),
+        _read_optional_file(folder / "masked.png"),
+        icons,
+    )
+def _load_assets_from_zip(
+    archive: Path,
+) -> tuple[bytes | None, bytes | None, dict[str, bytes]]:
+    """Read original.png, masked.png and icons/*.png from an annotation ZIP.
+    Raises:
+        zipfile.BadZipFile: if ``archive`` is not a valid ZIP file.
+    """
+    import zipfile
+    with zipfile.ZipFile(archive) as zf:
+        # List the archive once: the list keeps member order, the set gives
+        # constant-time membership tests.
+        names = zf.namelist()
+        members = set(names)
+        original = zf.read("original.png") if "original.png" in members else None
+        masked = zf.read("masked.png") if "masked.png" in members else None
+        icons: dict[str, bytes] = {}
+        for filename in names:
+            if filename.startswith("icons/") and filename.endswith(".png"):
+                icons[Path(filename).stem] = zf.read(filename)
+    return original, masked, icons
+@cli.command("from-annotation")
+@click.argument("annotation_path", type=click.Path(exists=True))
+@click.option(
+    "--output-dir",
+    "-o",
+    type=click.Path(),
+    default=".",
+    help="Directory to create the project in (default: current directory)",
+)
+@click.option(
+    "--name",
+    "-n",
+    help="Project name (default: derived from annotation)",
+)
+@click.option(
+    "--in-place",
+    "-i",
+    is_flag=True,
+    help="Write directly to output-dir without creating a subdirectory",
+)
+def from_annotation(
+    annotation_path: str, output_dir: str, name: str | None, in_place: bool
+) -> None:
+    """Create a CUDAG project from an annotation folder or ZIP.
+    ANNOTATION_PATH is the path to an annotation folder or .zip file
+    exported from the Annotator application. The folder should contain:
+    - annotation.json: Element and task definitions
+    - original.png: Original screenshot
+    - masked.png: Screenshot with masked regions
+    - icons/: Optional folder with extracted icons
+    This generates a complete project structure with:
+    - screen.py: Screen definition with regions
+    - state.py: State class for dynamic content
+    - renderer.py: Renderer using the masked image
+    - tasks/: Task files for each defined task
+    - config/: Dataset configuration
+    - assets/: Base images and icons
+    """
+    import zipfile
+    from cudag.annotation import AnnotationLoader, scaffold_generator
+    loader = AnnotationLoader()
+    annotation_source = Path(annotation_path)
+    try:
+        parsed = loader.load(annotation_source)
+    except Exception as e:
+        click.secho(f"Error loading annotation: {e}", fg="red")
+        raise SystemExit(1) from e
+    project_name = name or parsed.screen_name
+    # Load images from folder or ZIP
+    if annotation_source.is_dir():
+        original_bytes, masked_bytes, icons = _load_assets_from_dir(annotation_source)
+    elif annotation_source.suffix.lower() == ".zip":
+        # Suffix check is case-insensitive so "export.ZIP" is accepted too.
+        try:
+            original_bytes, masked_bytes, icons = _load_assets_from_zip(
+                annotation_source
+            )
+        except zipfile.BadZipFile as e:
+            # Report a corrupt archive like other input errors, not as a traceback.
+            click.secho(f"Error reading ZIP archive: {e}", fg="red")
+            raise SystemExit(1) from e
+    else:
+        click.secho("Error: Expected a folder or .zip file", fg="red")
+        raise SystemExit(1)
+    # Scaffold project
+    output_path = Path(output_dir)
+    files = scaffold_generator(
+        name=project_name,
+        annotation=parsed,
+        output_dir=output_path,
+        original_image=original_bytes,
+        masked_image=masked_bytes,
+        icons=icons,
+        in_place=in_place,
+    )
+    project_dir = output_path if in_place else output_path / project_name
+    click.secho(f"Created project: {project_dir}", fg="green")
+    click.echo(f"\nGenerated {len(files)} files:")
+    # Only the first ten generated files are listed; the rest are summarised.
+    for f in files[:10]:
+        click.echo(f"  {f.relative_to(project_dir)}")
+    if len(files) > 10:
+        click.echo(f"  ... and {len(files) - 10} more")
+    click.echo("\nNext steps:")
+    click.echo(f"  cd {project_dir}")
+    click.echo("  # Review and customize generated code")
+    click.echo("  python generator.py --samples 100")
+# Invoke the root click group when this file is executed directly as a script.
+if __name__ == "__main__":
+    cli()