PyPI - data-morph-gemma - Versions diffs - 0.1.0__py3-none-any.whl - Mend

data-morph-gemma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
datamorph/__init__.py +19 -0
datamorph/cli.py +84 -0
datamorph/convert.py +146 -0
datamorph/data/__init__.py +1 -0
datamorph/data/collect.py +221 -0
datamorph/data/envelope.py +20 -0
datamorph/data/generators/__init__.py +1 -0
datamorph/data/generators/base.py +48 -0
datamorph/data/generators/uc1_csv_to_json.py +64 -0
datamorph/data/generators/uc2_json_to_csv.py +59 -0
datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
datamorph/data/generators/uc5_schema_migration.py +49 -0
datamorph/data/sandbox.py +95 -0
datamorph/data/teacher_script.py +114 -0
datamorph/evaluation/__init__.py +0 -0
datamorph/evaluation/metrics.py +264 -0
datamorph/evaluation/output_cleanup.py +116 -0
datamorph/evaluation/runner.py +218 -0
datamorph/evaluation/teacher.py +193 -0
datamorph/extractor/__init__.py +15 -0
datamorph/extractor/base.py +26 -0
datamorph/extractor/csv_extractor.py +515 -0
datamorph/extractor/json_extractor.py +447 -0
datamorph/extractor/json_walker.py +217 -0
datamorph/extractor/sampler.py +68 -0
datamorph/extractor/txt_extractor.py +199 -0
datamorph/extractor/warning_rules.py +473 -0
datamorph/features/__init__.py +1 -0
datamorph/features/format_pairs.py +57 -0
datamorph/model.py +63 -0
datamorph/models/__init__.py +0 -0
datamorph/models/gemma_mlx.py +163 -0
datamorph/models/gemma_script_teacher.py +100 -0

datamorph/data/teacher_script.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Stage 3 — Claude Opus writes a conversion script from a metadata envelope.
+Mirrors datamorph/evaluation/teacher.py::_call_opus (same `claude -p` invocation), but
+the model returns <analysis> + <script> rather than a converted file. The live
+call is exercised only by opt-in tests; parsing/prompt building are pure.
+"""
+from __future__ import annotations
+import json
+import re
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+# Directory two levels above the one containing this file; used as the working
+# directory for the `claude` subprocess in call_script_teacher.
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+# Path of the skill file named in the prompt text.
+# NOTE(review): presumably resolved relative to PROJECT_ROOT by the CLI — confirm.
+SKILL_REL_PATH = "skills/script_generation_teacher.md"
+# Non-greedy, DOTALL matchers for the body of an <analysis> / <script> block.
+_ANALYSIS_RE = re.compile(r"<analysis>(.*?)</analysis>", re.DOTALL)
+_SCRIPT_RE = re.compile(r"<script>(.*?)</script>", re.DOTALL)
+# Matches a string that starts with a ``` / ```python / ```py fence line and ends
+# with a closing ``` line; group 1 is the fenced body.
+_FENCE_RE = re.compile(r"^```(?:python|py)?\s*\n(.*?)\n```$", re.DOTALL)
+@dataclass
+class ScriptResult:
+    analysis: str
+    script: str
+    raw_output: str
+    returncode: int
+    stderr: str
+    raw_payload: dict
+    @property
+    def ok(self) -> bool:
+        return self.returncode == 0 and bool(self.script)
+def _strip_fence(text: str) -> str:
+    stripped = text.strip()
+    m = _FENCE_RE.match(stripped)
+    return m.group(1).strip() if m else stripped
+def parse_teacher_output(text: str) -> tuple[str, str]:
+    """Return (analysis, script). Script has any wrapping ```fence``` removed."""
+    a = _ANALYSIS_RE.search(text)
+    s = _SCRIPT_RE.search(text)
+    analysis = a.group(1).strip() if a else ""
+    script = _strip_fence(s.group(1)) if s else ""
+    return analysis, script
+def build_script_prompt(
+    envelope: dict[str, Any],
+    instruction: str,
+    output_format: str,
+    feedback: str | None = None,
+) -> str:
+    env_json = json.dumps(envelope, indent=2, default=str)
+    fb = (
+        f"\n\nYour previous attempt failed: {feedback}\n"
+        f"Write a corrected <analysis> + <script>.\n"
+        if feedback
+        else ""
+    )
+    return (
+        f"Read the instructions in {SKILL_REL_PATH}, then write a Python conversion script.\n\n"
+        f"You are given the METADATA ENVELOPE of a source file (not the file itself):\n"
+        f"```json\n{env_json}\n```\n\n"
+        f"Task: {instruction}\n"
+        f"Target output format: {output_format.upper()}.\n\n"
+        f"The script must read the input file path from sys.argv[1] and write the converted "
+        f"output to sys.argv[2], using only the Python standard library and pandas. Respond "
+        f"with exactly an <analysis>...</analysis> block followed by a <script>...</script> "
+        f"block. No prose, no code fences outside the script tags."
+        f"{fb}"
+    )
+def call_script_teacher(
+    envelope: dict[str, Any],
+    instruction: str,
+    output_format: str,
+    *,
+    timeout: int = 240,
+    feedback: str | None = None,
+) -> ScriptResult:
+    """Run `claude -p --model opus` and parse <analysis> + <script> from the result."""
+    prompt = build_script_prompt(envelope, instruction, output_format, feedback)
+    cmd = [
+        "claude", "-p", prompt,
+        "--model", "opus",
+        "--output-format", "json",
+        "--allowedTools", "Read",
+    ]
+    proc = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        cwd=str(PROJECT_ROOT),
+        timeout=timeout,
+        encoding="utf-8",
+        errors="replace",
+    )
+    if proc.returncode != 0:
+        return ScriptResult("", "", "", proc.returncode, proc.stderr or "", {})
+    try:
+        payload = json.loads(proc.stdout)
+    except json.JSONDecodeError as e:
+        return ScriptResult("", "", "", -1, f"decode error: {e}", {"stdout_head": proc.stdout[:500]})
+    raw = payload.get("result", "") or ""
+    analysis, script = parse_teacher_output(raw)
+    return ScriptResult(analysis, script, raw, 0, proc.stderr or "", payload)

datamorph/evaluation/__init__.py ADDED Viewed

File without changes

datamorph/evaluation/metrics.py ADDED Viewed

@@ -0,0 +1,264 @@
+from __future__ import annotations
+import csv
+import io
+import json
+from typing import Any, Iterable
+# 1. Format validity
+def format_validity(output: str, output_format: str) -> float:
+    """Return 1.0 if output parses as the target format, else 0.0."""
+    fmt = output_format.lower()
+    if fmt == "json":
+        try:
+            json.loads(output)
+            return 1.0
+        except (json.JSONDecodeError, ValueError):
+            return 0.0
+    if fmt == "csv":
+        try:
+            reader = csv.reader(io.StringIO(output))
+            rows = list(reader)
+            if not rows:
+                return 0.0
+            width = len(rows[0])
+            if width == 0:
+                return 0.0
+            # every row must have the same column count
+            if not all(len(r) == width for r in rows):
+                return 0.0
+            return 1.0
+        except csv.Error:
+            return 0.0
+    if fmt == "txt":
+        return 1.0 if output.strip() else 0.0
+    raise ValueError(f"Unknown output_format: {output_format!r}")
+# 2. Schema compliance
+def _json_key_skeleton(obj: Any) -> Any:
+    """Recursively reduce a JSON value to its structural skeleton.
+    Dicts -> sorted tuple of (key, child_skeleton).
+    Lists -> ('list', child_skeleton_of_first) so we check the per-element shape
+             rather than length (caller decides whether length matters).
+    Scalars -> the type name.
+    """
+    if isinstance(obj, dict):
+        return tuple(sorted((k, _json_key_skeleton(v)) for k, v in obj.items()))
+    if isinstance(obj, list):
+        if not obj:
+            return ("list", "empty")
+        # Use first element's skeleton as representative; a well-formed output
+        # should have homogeneous elements in each array position.
+        return ("list", _json_key_skeleton(obj[0]))
+    return type(obj).__name__
+def schema_compliance(actual: str, expected: str, output_format: str) -> float:
+    """Return 1.0 if actual's structural skeleton matches expected's, else 0.0."""
+    fmt = output_format.lower()
+    if fmt == "json":
+        try:
+            a = json.loads(actual)
+            e = json.loads(expected)
+        except (json.JSONDecodeError, ValueError):
+            return 0.0
+        return 1.0 if _json_key_skeleton(a) == _json_key_skeleton(e) else 0.0
+    if fmt == "csv":
+        try:
+            a_rows = list(csv.reader(io.StringIO(actual)))
+            e_rows = list(csv.reader(io.StringIO(expected)))
+        except csv.Error:
+            return 0.0
+        if not a_rows or not e_rows:
+            return 0.0
+        # header match (case-insensitive, trimmed)
+        a_hdr = [c.strip().lower() for c in a_rows[0]]
+        e_hdr = [c.strip().lower() for c in e_rows[0]]
+        return 1.0 if a_hdr == e_hdr else 0.0
+    if fmt == "txt":
+        # No meaningful structural check for freeform TXT.
+        return 1.0
+    raise ValueError(f"Unknown output_format: {output_format!r}")
+# 3. Loadability
+def loadability(output: str, output_format: str) -> float:
+    """Return 1.0 if pandas can load the output without error, else 0.0."""
+    fmt = output_format.lower()
+    if fmt == "json":
+        try:
+            import pandas as pd
+            data = json.loads(output)
+            # pd.json_normalize handles both lists-of-objects and nested dicts.
+            if isinstance(data, list):
+                pd.json_normalize(data)
+            elif isinstance(data, dict):
+                # normalize the first list-valued field if present, else wrap.
+                list_fields = [v for v in data.values() if isinstance(v, list)]
+                if list_fields:
+                    pd.json_normalize(list_fields[0])
+                else:
+                    pd.json_normalize([data])
+            else:
+                return 0.0
+            return 1.0
+        except Exception:
+            return 0.0
+    if fmt == "csv":
+        try:
+            import pandas as pd
+            df = pd.read_csv(io.StringIO(output))
+            return 1.0 if len(df.columns) > 0 else 0.0
+        except Exception:
+            return 0.0
+    if fmt == "txt":
+        return 1.0 if output.strip() else 0.0
+    raise ValueError(f"Unknown output_format: {output_format!r}")
+# 4. Content accuracy
+def _values_equal(a: Any, b: Any) -> bool:
+    """Compare two scalar values with light coercion.
+    - Numeric strings compare equal to numbers: "9.99" == 9.99.
+    - None == "". ("null" is handled by JSON already being None.)
+    - Strings compare case-sensitive after .strip().
+    """
+    if a == b:
+        return True
+    # Both numeric (possibly as strings)?
+    try:
+        fa, fb = float(a), float(b)
+        if fa == fb:
+            return True
+    except (TypeError, ValueError):
+        pass
+    # Both empty-ish?
+    if (a is None or a == "") and (b is None or b == ""):
+        return True
+    # String comparison with whitespace strip
+    if isinstance(a, str) and isinstance(b, str):
+        return a.strip() == b.strip()
+    return False
+def _walk_json_leaves(obj: Any, path: str = "") -> Iterable[tuple[str, Any]]:
+    """Yield (key_path, leaf_value) pairs from a JSON-decoded object."""
+    if isinstance(obj, dict):
+        for k, v in obj.items():
+            new_path = f"{path}.{k}" if path else k
+            yield from _walk_json_leaves(v, new_path)
+    elif isinstance(obj, list):
+        for i, v in enumerate(obj):
+            new_path = f"{path}[{i}]"
+            yield from _walk_json_leaves(v, new_path)
+    else:
+        yield path, obj
+def _json_content_accuracy(actual_text: str, expected_text: str) -> float:
+    """Fraction of expected leaf paths that match actual."""
+    try:
+        actual = json.loads(actual_text)
+        expected = json.loads(expected_text)
+    except (json.JSONDecodeError, ValueError):
+        return 0.0
+    actual_map = dict(_walk_json_leaves(actual))
+    expected_map = dict(_walk_json_leaves(expected))
+    if not expected_map:
+        return 0.0
+    matches = sum(
+        1
+        for path, ev in expected_map.items()
+        if path in actual_map and _values_equal(actual_map[path], ev)
+    )
+    return matches / len(expected_map)
+def _csv_content_accuracy(actual_text: str, expected_text: str) -> float:
+    """Fraction of expected cells that match actual (by header-aware row alignment).
+    Rows are aligned positionally; cells are compared by shared column name.
+    If the header differs, score is 0.0 (that's a schema-compliance issue).
+    """
+    try:
+        a_rows = list(csv.reader(io.StringIO(actual_text)))
+        e_rows = list(csv.reader(io.StringIO(expected_text)))
+    except csv.Error:
+        return 0.0
+    if len(a_rows) < 1 or len(e_rows) < 1:
+        return 0.0
+    a_hdr = [c.strip() for c in a_rows[0]]
+    e_hdr = [c.strip() for c in e_rows[0]]
+    if [h.lower() for h in a_hdr] != [h.lower() for h in e_hdr]:
+        return 0.0
+    a_data, e_data = a_rows[1:], e_rows[1:]
+    total = len(e_data) * len(e_hdr)
+    if total == 0:
+        return 0.0
+    matches = 0
+    for i, e_row in enumerate(e_data):
+        a_row = a_data[i] if i < len(a_data) else [""] * len(e_hdr)
+        for j, e_cell in enumerate(e_row):
+            a_cell = a_row[j] if j < len(a_row) else ""
+            if _values_equal(a_cell.strip(), e_cell.strip()):
+                matches += 1
+    return matches / total
+def _txt_content_accuracy(actual_text: str, required_substrings: list[str]) -> float:
+    """Fraction of required substrings present in actual (case-insensitive)."""
+    if not required_substrings:
+        return 0.0
+    hay = actual_text.lower()
+    hits = sum(1 for s in required_substrings if s.lower() in hay)
+    return hits / len(required_substrings)
+def content_accuracy(
+    actual: str,
+    expected: str,
+    output_format: str,
+    required_substrings: list[str] | None = None,
+) -> float:
+    """Dispatch to the format-appropriate content-accuracy routine."""
+    fmt = output_format.lower()
+    if fmt == "json":
+        return _json_content_accuracy(actual, expected)
+    if fmt == "csv":
+        return _csv_content_accuracy(actual, expected)
+    if fmt == "txt":
+        return _txt_content_accuracy(actual, required_substrings or [])
+    raise ValueError(f"Unknown output_format: {output_format!r}")
+# Aggregate helper
+def score_all(
+    actual: str,
+    expected: str,
+    output_format: str,
+    required_substrings: list[str] | None = None,
+) -> dict[str, float]:
+    """Run all four metrics and return a dict of scores."""
+    return {
+        "format_validity": format_validity(actual, output_format),
+        "schema_compliance": schema_compliance(actual, expected, output_format),
+        "loadability": loadability(actual, output_format),
+        "content_accuracy": content_accuracy(
+            actual, expected, output_format, required_substrings
+        ),
+    }

datamorph/evaluation/output_cleanup.py ADDED Viewed

@@ -0,0 +1,116 @@
+from __future__ import annotations
+import re
+def clean_model_output(raw: str, output_format: str) -> tuple[str, list[str]]:
+    applied: list[str] = []
+    text = raw
+    stripped = text.strip()
+    if stripped != text:
+        applied.append("strip_whitespace")
+    text = stripped
+    fenced = _try_strip_code_fence(text)
+    if fenced is not None:
+        text = fenced
+        applied.append("strip_code_fence")
+    pre_stripped = _try_strip_preamble(text, output_format)
+    if pre_stripped is not None:
+        text = pre_stripped
+        applied.append("strip_preamble")
+    if output_format == "json":
+        trailing_stripped = _try_strip_trailing_prose_json(text)
+        if trailing_stripped is not None:
+            text = trailing_stripped
+            applied.append("strip_trailing_prose")
+    final = text.strip()
+    if final != text and "strip_whitespace" not in applied:
+        applied.append("strip_whitespace")
+    return final, applied
+_FENCE_OPEN = re.compile(r"^```([A-Za-z0-9_+\-]*)\s*\n", re.MULTILINE)
+_FENCE_CLOSE = "\n```"
+def _try_strip_code_fence(text: str) -> str | None:
+    m = _FENCE_OPEN.match(text)
+    if not m:
+        return None
+    body_start = m.end()
+    close_idx = text.find(_FENCE_CLOSE, body_start)
+    if close_idx == -1:
+        return None  # unclosed — skip
+    return text[body_start:close_idx]
+def _try_strip_preamble(text: str, output_format: str) -> str | None:
+    if output_format == "txt":
+        return None
+    lines = text.split("\n")
+    if output_format == "json":
+        for i, line in enumerate(lines):
+            s = line.lstrip()
+            if s.startswith("{") or s.startswith("["):
+                if i == 0:
+                    return None
+                return "\n".join(lines[i:])
+        return None
+    if output_format == "csv":
+        for i, line in enumerate(lines):
+            if "," in line:
+                if i == 0:
+                    return None
+                return "\n".join(lines[i:])
+        return None
+    return None
+def _try_strip_trailing_prose_json(text: str) -> str | None:
+    # Find first opening bracket
+    start = -1
+    open_ch = ""
+    for i, c in enumerate(text):
+        if c == "{" or c == "[":
+            start = i
+            open_ch = c
+            break
+    if start == -1:
+        return None
+    close_ch = "}" if open_ch == "{" else "]"
+    depth = 0
+    in_string = False
+    escape = False
+    end = -1
+    for i in range(start, len(text)):
+        c = text[i]
+        if escape:
+            escape = False
+            continue
+        if c == "\\":
+            escape = True
+            continue
+        if c == '"':
+            in_string = not in_string
+            continue
+        if in_string:
+            continue
+        if c == open_ch:
+            depth += 1
+        elif c == close_ch:
+            depth -= 1
+            if depth == 0:
+                end = i + 1
+                break
+    if end == -1:
+        return None
+    if end == len(text):
+        return None  # nothing to strip
+    if text[end:].strip() == "":
+        return None
+    return text[:end]