data-morph-gemma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
  2. data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
  3. data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
  4. data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
  6. datamorph/__init__.py +19 -0
  7. datamorph/cli.py +84 -0
  8. datamorph/convert.py +146 -0
  9. datamorph/data/__init__.py +1 -0
  10. datamorph/data/collect.py +221 -0
  11. datamorph/data/envelope.py +20 -0
  12. datamorph/data/generators/__init__.py +1 -0
  13. datamorph/data/generators/base.py +48 -0
  14. datamorph/data/generators/uc1_csv_to_json.py +64 -0
  15. datamorph/data/generators/uc2_json_to_csv.py +59 -0
  16. datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
  17. datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
  18. datamorph/data/generators/uc5_schema_migration.py +49 -0
  19. datamorph/data/sandbox.py +95 -0
  20. datamorph/data/teacher_script.py +114 -0
  21. datamorph/evaluation/__init__.py +0 -0
  22. datamorph/evaluation/metrics.py +264 -0
  23. datamorph/evaluation/output_cleanup.py +116 -0
  24. datamorph/evaluation/runner.py +218 -0
  25. datamorph/evaluation/teacher.py +193 -0
  26. datamorph/extractor/__init__.py +15 -0
  27. datamorph/extractor/base.py +26 -0
  28. datamorph/extractor/csv_extractor.py +515 -0
  29. datamorph/extractor/json_extractor.py +447 -0
  30. datamorph/extractor/json_walker.py +217 -0
  31. datamorph/extractor/sampler.py +68 -0
  32. datamorph/extractor/txt_extractor.py +199 -0
  33. datamorph/extractor/warning_rules.py +473 -0
  34. datamorph/features/__init__.py +1 -0
  35. datamorph/features/format_pairs.py +57 -0
  36. datamorph/model.py +63 -0
  37. datamorph/models/__init__.py +0 -0
  38. datamorph/models/gemma_mlx.py +163 -0
  39. datamorph/models/gemma_script_teacher.py +100 -0
@@ -0,0 +1,68 @@
1
+ """Strategic head/middle/tail sampling for tabular files.
2
+
3
+ Reads only the rows it needs via pandas `nrows` and `skiprows`. Never
4
+ loads the full file into memory.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Any, cast
11
+
12
+ import pandas as pd
13
+
14
+
15
def sample_csv(
    file_path: Path,
    *,
    total_rows: int,
    encoding: str,
    head_n: int = 3,
    middle_n: int = 1,
    tail_n: int = 1,
) -> dict[str, list[dict[str, Any]]]:
    """Return head/middle/tail records as a dict of three lists.

    Args:
        file_path: CSV file to sample; line 0 is treated as the header.
        total_rows: Number of data rows in the file, as counted by the
            caller. It is trusted, not re-checked against the file.
        encoding: Text encoding passed to ``pandas.read_csv``.
        head_n: Rows taken from the start of the data.
        middle_n: Rows taken from around the midpoint.
        tail_n: Rows taken from the end of the data.

    Returns:
        ``{"head": [...], "middle": [...], "tail": [...]}`` where each entry
        is a list of ``{column: value}`` records read with ``dtype=str``.

    Small-file rule: if total_rows <= head_n + middle_n + tail_n, all rows
    go into head and middle/tail are empty.

    The three sections never overlap: the middle window starts at
    ``total_rows // 2`` and is pulled back between the head and tail windows
    only when it would otherwise re-sample one of their rows.
    """
    if total_rows <= 0:
        return {"head": [], "middle": [], "tail": []}

    def _read(first_row: int, n_rows: int) -> list[dict[str, Any]]:
        """Read ``n_rows`` data rows starting at 0-based data row ``first_row``."""
        if n_rows == 0:
            # Nothing requested: same result as nrows=0, without scanning the file.
            return []
        # File line 0 is the header, so data row k lives on file line k + 1;
        # skipping lines 1..first_row leaves the header plus the wanted rows.
        skip = range(1, first_row + 1) if first_row else None
        frame = pd.read_csv(
            file_path,
            skiprows=skip,
            nrows=n_rows,
            encoding=encoding,
            dtype=str,
        )
        return cast("list[dict[str, Any]]", frame.to_dict("records"))

    if total_rows <= head_n + middle_n + tail_n:
        return {"head": _read(0, total_rows), "middle": [], "tail": []}

    # Past the small-file rule there is always room for a middle window
    # strictly between head and tail, so the clamp bounds are well ordered.
    earliest = head_n
    latest = total_rows - tail_n - middle_n
    middle_start = min(max(total_rows // 2, earliest), latest)

    return {
        "head": _read(0, head_n),
        "middle": _read(middle_start, middle_n),
        "tail": _read(total_rows - tail_n, tail_n),
    }
@@ -0,0 +1,199 @@
1
+ """TXT metadata extractor — line-pattern inference + envelope.
2
+
3
+ CLI entry point at the bottom of the file. See the spec at
4
+ docs/superpowers/specs/2026-05-25-data-collection-pipeline-design.md.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from collections import Counter
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from .base import MetadataExtractor
15
+ from .csv_extractor import detect_encoding
16
+ from .warning_rules import (
17
+ MetadataWarning,
18
+ check_empty_file,
19
+ check_inconsistent_field_count,
20
+ check_latin1_fallback,
21
+ check_likely_timestamp_prefix,
22
+ check_mixed_line_structure,
23
+ check_no_pattern_detected,
24
+ )
25
+
26
# Match-ratio cut-off handed to check_mixed_line_structure as `threshold`.
PATTERN_THRESHOLD = 0.8

# Candidate field separators, tried in this order by _dominant_delimiter.
_DELIMITERS: tuple[str, ...] = (
    ",",
    "\t",
    "|",
    ";",
)

# Log-line prefix: an optional "[", a YYYY-MM-DD date, a space or "T",
# an HH:MM:SS time, and an optional closing "]", anchored at line start.
_LOG_PREFIX_RE = re.compile(
    r"^\[?\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}\]?"
)

# Key/value line: a key of 1-40 word, space, dot or hyphen characters,
# then ":" or "=", optional whitespace, and at least one non-space character.
_KEY_VALUE_RE = re.compile(r"^[\w .-]{1,40}[:=]\s*\S")
32
+
33
+
34
def _dominant_delimiter(lines: list[str]) -> tuple[str | None, list[int]]:
    """Choose the candidate delimiter whose most common field count covers the most lines.

    For every delimiter in `_DELIMITERS` each line is split naively (no quote
    handling) and the most frequent field count of at least 2 is found. The
    delimiter with the largest number of lines at that count wins; on equal
    scores the earlier candidate is kept.

    Returns:
        `(delimiter, per_line_field_counts)` for the winner, or `(None, [])`
        when no candidate splits any line into two or more fields.
    """
    winner: str | None = None
    winner_widths: list[int] = []
    top_hits = 0

    for candidate in _DELIMITERS:
        # Field count of a naive split is one more than the separator count.
        widths = [ln.count(candidate) + 1 for ln in lines]
        tally = Counter(w for w in widths if w >= 2)
        if not tally:
            continue
        # The modal width is >= 2, so its tally is exactly the number of
        # lines that split to that width.
        _, hits = tally.most_common(1)[0]
        if hits > top_hits:
            winner, winner_widths, top_hits = candidate, widths, hits

    return winner, winner_widths
50
+
51
+
52
def infer_line_pattern(lines: list[str]) -> dict[str, Any]:
    """Report which line structure is most common among the given lines.

    Three candidates are measured as a share of all lines: a leading
    timestamp (`log_line`), a consistent delimiter split (`delimited`), and a
    `key: value` / `key=value` shape (`key_value`). The highest share wins,
    with ties resolved in that order.

    Returns:
        A dict with `record_pattern` and `match_ratio` (rounded to 3 places).
        `log_line` results also carry `pattern_regex`; `delimited` results
        carry `delimiter` and `field_counts` (one count per input line).

    The winner is reported even when its share is below `PATTERN_THRESHOLD`,
    provided it matched at least one line, so a partial match is still visible
    to callers. Empty input, or input no candidate matches, yields
    `freeform` with `match_ratio` 0.0.
    """
    total = len(lines)
    if total == 0:
        return {"record_pattern": "freeform", "match_ratio": 0.0}

    log_ratio = sum(_LOG_PREFIX_RE.match(ln) is not None for ln in lines) / total

    delim, counts = _dominant_delimiter(lines)
    delim_ratio = 0.0
    if delim is not None:
        modal = Counter(c for c in counts if c >= 2).most_common(1)[0][0]
        delim_ratio = counts.count(modal) / total

    kv_ratio = sum(_KEY_VALUE_RE.match(ln) is not None for ln in lines) / total

    # Start from the highest-priority candidate and only replace it on a
    # strictly larger ratio, so ties keep log_line > delimited > key_value.
    best_pattern, best_ratio = "log_line", log_ratio
    for name, ratio in (("delimited", delim_ratio), ("key_value", kv_ratio)):
        if ratio > best_ratio:
            best_pattern, best_ratio = name, ratio

    if best_ratio <= 0.0:
        return {"record_pattern": "freeform", "match_ratio": 0.0}

    result: dict[str, Any] = {
        "record_pattern": best_pattern,
        "match_ratio": round(best_ratio, 3),
    }
    if best_pattern == "delimited":
        result["delimiter"] = delim
        result["field_counts"] = counts
    elif best_pattern == "log_line":
        result["pattern_regex"] = _LOG_PREFIX_RE.pattern
    return result
97
+
98
+
99
def _read_nonblank_lines(file_path: Path, encoding: str) -> list[str]:
    """Read the file as text and return every line that is not whitespace-only.

    Line terminators are removed from the end of each kept line; other
    leading and trailing whitespace is left in place.
    """
    with file_path.open("r", encoding=encoding) as handle:
        # str.strip returns "" (falsy) for blank or whitespace-only lines.
        return [raw.rstrip("\n\r") for raw in filter(str.strip, handle)]
103
+
104
+
105
def sample_lines(
    lines: list[str], *, head_n: int = 3, middle_n: int = 1, tail_n: int = 1
) -> dict[str, list[str]]:
    """Pick non-overlapping head, middle and tail slices of `lines`.

    When the input has no more than `head_n + middle_n + tail_n` entries,
    everything is returned under "head" and the other two lists are empty.
    Otherwise the middle slice is centred in the gap between the head and
    tail slices. The result has the same three keys as sampler.sample_csv.
    """
    total = len(lines)
    if not total:
        return {"head": [], "middle": [], "tail": []}
    if head_n + middle_n + tail_n >= total:
        return {"head": list(lines), "middle": [], "tail": []}

    tail_from = total - tail_n
    tail_part = lines[tail_from:] if tail_n > 0 else []

    if middle_n > 0:
        # Centre a window of middle_n lines inside [head_n, tail_from).
        first_mid = (head_n + tail_from - middle_n) // 2
        middle_part = lines[first_mid : first_mid + middle_n]
    else:
        middle_part = []

    return {"head": lines[:head_n], "middle": middle_part, "tail": tail_part}
119
+
120
+
121
class TXTExtractor(MetadataExtractor):
    """Stage 1c extractor: builds the shared metadata envelope for a .txt/.log file."""

    def __init__(self, head_n: int = 3, middle_n: int = 1, tail_n: int = 1) -> None:
        """Store how many lines to sample from the start, middle and end."""
        self.head_n, self.middle_n, self.tail_n = head_n, middle_n, tail_n

    def supports(self, file_path: Path) -> bool:
        """Return True for paths whose suffix is .txt or .log, ignoring case."""
        suffix = file_path.suffix.lower()
        return suffix == ".txt" or suffix == ".log"

    def extract(self, file_path: Path) -> dict[str, Any]:
        """Read the file, infer its line pattern and assemble the envelope.

        Returns a dict with the keys `format`, `file_path`, `file_size_bytes`,
        `encoding`, `schema_version`, `schema`, `samples` and `warnings`.
        Warnings appear in the order the checks run below.
        """
        size_bytes = file_path.stat().st_size

        encoding, attempted = detect_encoding(file_path)
        # Each check returns a warning or None; the Nones are dropped at the end.
        findings: list[MetadataWarning | None] = [
            check_latin1_fallback(final_encoding=encoding, attempted=attempted)
        ]

        lines = _read_nonblank_lines(file_path, encoding)
        n_lines = len(lines)
        findings.append(check_empty_file(row_count=n_lines))

        pattern = infer_line_pattern(lines)
        kind = pattern["record_pattern"]
        findings.append(check_no_pattern_detected(record_pattern=kind))
        findings.append(check_likely_timestamp_prefix(record_pattern=kind))
        findings.append(
            check_mixed_line_structure(
                match_ratio=pattern.get("match_ratio", 0.0),
                threshold=PATTERN_THRESHOLD,
            )
        )
        if kind == "delimited":
            findings.append(
                check_inconsistent_field_count(field_counts=pattern.get("field_counts", []))
            )

        # line_count goes first, then every key of the inferred pattern.
        schema: dict[str, Any] = {"line_count": n_lines}
        schema.update(pattern)

        envelope: dict[str, Any] = {
            "format": "txt",
            "file_path": str(file_path),
            "file_size_bytes": size_bytes,
            "encoding": encoding,
            # SCHEMA_VERSION is not defined in this class; presumably inherited
            # from MetadataExtractor.
            "schema_version": self.SCHEMA_VERSION,
            "schema": schema,
            "samples": sample_lines(
                lines, head_n=self.head_n, middle_n=self.middle_n, tail_n=self.tail_n
            ),
            "warnings": [w.to_dict() for w in findings if w is not None],
        }
        return envelope
173
+
174
+
175
def _push(bucket: list[MetadataWarning], maybe: MetadataWarning | None) -> None:
    """Append `maybe` to `bucket` in place, ignoring it when it is None."""
    if maybe is None:
        return
    bucket.append(maybe)
178
+
179
+
180
def _main() -> int:
    """Command-line entry point.

    Parses one positional file argument, prints that file's metadata envelope
    as indented JSON, then prints a rough token estimate (character count
    divided by 4) on a following `#` line. Returns 0.
    """
    import argparse
    import json

    cli = argparse.ArgumentParser(
        description="Extract metadata envelope from a .txt/.log file.",
        prog="python -m datamorph.extractor.txt_extractor",
    )
    cli.add_argument("file", help="Path to a .txt or .log file")
    target = Path(cli.parse_args().file)

    rendered = json.dumps(
        TXTExtractor().extract(target),
        indent=2,
        default=str,
        ensure_ascii=False,
    )
    print(rendered)

    approx_tokens = len(rendered) // 4
    print(f"# rough token estimate: ~{approx_tokens} (chars / 4)")
    return 0


if __name__ == "__main__":
    raise SystemExit(_main())