data-morph-gemma 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_morph_gemma-0.1.0.dist-info/METADATA +177 -0
- data_morph_gemma-0.1.0.dist-info/RECORD +39 -0
- data_morph_gemma-0.1.0.dist-info/WHEEL +4 -0
- data_morph_gemma-0.1.0.dist-info/entry_points.txt +2 -0
- data_morph_gemma-0.1.0.dist-info/licenses/LICENSE +25 -0
- datamorph/__init__.py +19 -0
- datamorph/cli.py +84 -0
- datamorph/convert.py +146 -0
- datamorph/data/__init__.py +1 -0
- datamorph/data/collect.py +221 -0
- datamorph/data/envelope.py +20 -0
- datamorph/data/generators/__init__.py +1 -0
- datamorph/data/generators/base.py +48 -0
- datamorph/data/generators/uc1_csv_to_json.py +64 -0
- datamorph/data/generators/uc2_json_to_csv.py +59 -0
- datamorph/data/generators/uc3_txt_log_to_csv.py +64 -0
- datamorph/data/generators/uc4_csv_to_txt_report.py +62 -0
- datamorph/data/generators/uc5_schema_migration.py +49 -0
- datamorph/data/sandbox.py +95 -0
- datamorph/data/teacher_script.py +114 -0
- datamorph/evaluation/__init__.py +0 -0
- datamorph/evaluation/metrics.py +264 -0
- datamorph/evaluation/output_cleanup.py +116 -0
- datamorph/evaluation/runner.py +218 -0
- datamorph/evaluation/teacher.py +193 -0
- datamorph/extractor/__init__.py +15 -0
- datamorph/extractor/base.py +26 -0
- datamorph/extractor/csv_extractor.py +515 -0
- datamorph/extractor/json_extractor.py +447 -0
- datamorph/extractor/json_walker.py +217 -0
- datamorph/extractor/sampler.py +68 -0
- datamorph/extractor/txt_extractor.py +199 -0
- datamorph/extractor/warning_rules.py +473 -0
- datamorph/features/__init__.py +1 -0
- datamorph/features/format_pairs.py +57 -0
- datamorph/model.py +63 -0
- datamorph/models/__init__.py +0 -0
- datamorph/models/gemma_mlx.py +163 -0
- datamorph/models/gemma_script_teacher.py +100 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Strategic head/middle/tail sampling for tabular files.
|
|
2
|
+
|
|
3
|
+
Reads only the rows it needs via pandas `nrows` and `skiprows`. Never
|
|
4
|
+
loads the full file into memory.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, cast
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def sample_csv(
    file_path: Path,
    *,
    total_rows: int,
    encoding: str,
    head_n: int = 3,
    middle_n: int = 1,
    tail_n: int = 1,
) -> dict[str, list[dict[str, Any]]]:
    """Return head/middle/tail records as a dict of three lists.

    Every column is read as ``str`` (``dtype=str``). Only the requested
    windows are parsed, via pandas ``nrows``/``skiprows``.

    Args:
        file_path: CSV file to sample; line 0 is treated as the header.
        total_rows: Number of data rows the caller counted in the file.
        encoding: Text encoding passed through to ``pd.read_csv``.
        head_n: Number of rows taken from the start.
        middle_n: Number of rows taken from around the midpoint.
        tail_n: Number of rows taken from the end.

    Returns:
        ``{"head": [...], "middle": [...], "tail": [...]}`` where each list
        holds one dict per sampled row.

    Small-file rule: if total_rows <= head_n + middle_n + tail_n, all rows
    go into head and middle/tail are empty.

    The three windows never overlap: the middle window starts at
    ``total_rows // 2`` but is clamped into the gap between head and tail,
    so non-default sizes (e.g. a large ``head_n``) cannot re-sample rows.

    NOTE(review): ``skiprows`` indexes physical lines, so the middle/tail
    positions assume one record per line — confirm for files with quoted
    multi-line fields.
    """
    if total_rows <= 0:
        return {"head": [], "middle": [], "tail": []}

    def _records(df: pd.DataFrame) -> list[dict[str, Any]]:
        return cast("list[dict[str, Any]]", df.to_dict("records"))

    def _read_window(first_row: int, count: int) -> pd.DataFrame:
        # Skip data lines 1..first_row (line 0 is the header) and read `count` rows.
        return pd.read_csv(
            file_path,
            skiprows=list(range(1, first_row + 1)),
            nrows=count,
            encoding=encoding,
            dtype=str,
        )

    if total_rows <= head_n + middle_n + tail_n:
        head = pd.read_csv(file_path, nrows=total_rows, encoding=encoding, dtype=str)
        return {
            "head": _records(head),
            "middle": [],
            "tail": [],
        }

    head = pd.read_csv(file_path, nrows=head_n, encoding=encoding, dtype=str)

    # middle: start near the file's midpoint, but keep the window strictly
    # between the head rows and the tail rows. The small-file guard above
    # guarantees head_n <= total_rows - tail_n - middle_n, so the clamp is
    # always well-formed.
    last_valid_start = total_rows - tail_n - middle_n
    middle_start = min(max(total_rows // 2, head_n), last_valid_start)
    middle = _read_window(middle_start, middle_n)

    # tail: skip everything but the last tail_n rows
    tail = _read_window(total_rows - tail_n, tail_n)

    return {
        "head": _records(head),
        "middle": _records(middle),
        "tail": _records(tail),
    }
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""TXT metadata extractor — line-pattern inference + envelope.
|
|
2
|
+
|
|
3
|
+
CLI entry point at the bottom of the file. See the spec at
|
|
4
|
+
docs/superpowers/specs/2026-05-25-data-collection-pipeline-design.md.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from .base import MetadataExtractor
|
|
15
|
+
from .csv_extractor import detect_encoding
|
|
16
|
+
from .warning_rules import (
|
|
17
|
+
MetadataWarning,
|
|
18
|
+
check_empty_file,
|
|
19
|
+
check_inconsistent_field_count,
|
|
20
|
+
check_latin1_fallback,
|
|
21
|
+
check_likely_timestamp_prefix,
|
|
22
|
+
check_mixed_line_structure,
|
|
23
|
+
check_no_pattern_detected,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
# Match-ratio threshold handed to check_mixed_line_structure() by
# TXTExtractor.extract(). infer_line_pattern() still reports the dominant
# pattern when its ratio falls below this value.
PATTERN_THRESHOLD = 0.8
|
|
27
|
+
# Candidate field separators, tried in this order by _dominant_delimiter();
# an earlier entry wins when two delimiters score the same.
_DELIMITERS: tuple[str, ...] = (",", "\t", "|", ";")
|
|
28
|
+
|
|
29
|
+
# Leading timestamp: optional "[" then YYYY-MM-DD, space or T, HH:MM:SS, optional "]".
|
|
30
|
+
_LOG_PREFIX_RE = re.compile(r"^\[?\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}\]?")
|
|
31
|
+
# Key/value line: a key of 1-40 word characters, spaces, dots or hyphens at
# the start of the line, then ":" or "=", optional whitespace, and at least
# one non-space character of value.
_KEY_VALUE_RE = re.compile(r"^[\w .-]{1,40}[:=]\s*\S")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _dominant_delimiter(lines: list[str]) -> tuple[str | None, list[int]]:
    """Choose the candidate delimiter whose split widths agree most often.

    For each entry of `_DELIMITERS`, every line is split and the field count
    recorded. The delimiter's score is the number of lines whose field count
    equals the most common count among lines that split into >= 2 fields.
    The highest score wins; on a tie the earlier candidate is kept.

    Returns:
        `(delimiter, per_line_field_counts)` for the winner, or `(None, [])`
        when no candidate splits any line into two or more fields.
    """
    winner: str | None = None
    winner_widths: list[int] = []
    top_score = 0

    for candidate in _DELIMITERS:
        widths = [len(text.split(candidate)) for text in lines]
        split_widths = [w for w in widths if w >= 2]
        if split_widths:
            # Modal width is taken only over lines that actually split.
            mode_width = Counter(split_widths).most_common(1)[0][0]
            hits = widths.count(mode_width)
            if hits > top_score:
                top_score = hits
                winner, winner_widths = candidate, widths

    return (winner, winner_widths)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def infer_line_pattern(lines: list[str]) -> dict[str, Any]:
    """Work out which line structure dominates the given lines.

    Three candidates are scored by the fraction of lines they match:
    `log_line` (leading timestamp), `delimited` (modal field count of the
    dominant delimiter) and `key_value`. The highest ratio wins; ties are
    broken in the order log_line > delimited > key_value.

    The result always carries `record_pattern` and `match_ratio` (rounded to
    three decimals). A `log_line` result also has `pattern_regex`; a
    `delimited` result also has `delimiter` and `field_counts`.

    A winner is reported even if its ratio is below `PATTERN_THRESHOLD`, as
    long as it matched at least one line. When `lines` is empty or nothing
    matches, the result is `freeform` with `match_ratio` 0.0.
    """
    if not lines:
        return {"record_pattern": "freeform", "match_ratio": 0.0}

    total = len(lines)

    log_ratio = sum(1 for text in lines if _LOG_PREFIX_RE.match(text)) / total
    kv_ratio = sum(1 for text in lines if _KEY_VALUE_RE.match(text)) / total

    delim, counts = _dominant_delimiter(lines)
    delim_ratio = 0.0
    if delim is not None:
        split_widths = [c for c in counts if c >= 2]
        mode_width = Counter(split_widths).most_common(1)[0][0]
        delim_ratio = counts.count(mode_width) / total

    # Start from the highest-priority candidate and only switch on a strictly
    # better ratio, so ties resolve as log_line > delimited > key_value.
    best_pattern, best_ratio = "log_line", log_ratio
    for name, ratio in (("delimited", delim_ratio), ("key_value", kv_ratio)):
        if ratio > best_ratio:
            best_pattern, best_ratio = name, ratio

    if best_ratio <= 0.0:
        return {"record_pattern": "freeform", "match_ratio": 0.0}

    result: dict[str, Any] = {
        "record_pattern": best_pattern,
        "match_ratio": round(best_ratio, 3),
    }
    if best_pattern == "delimited":
        result["delimiter"] = delim
        result["field_counts"] = counts
    elif best_pattern == "log_line":
        result["pattern_regex"] = _LOG_PREFIX_RE.pattern
    return result
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _read_nonblank_lines(file_path: Path, encoding: str) -> list[str]:
    """Read the file as text and return its non-blank lines.

    A line is dropped when it is empty or whitespace-only. Kept lines lose
    only their trailing newline / carriage-return characters; any other
    leading or trailing whitespace is preserved.
    """
    kept: list[str] = []
    with file_path.open("r", encoding=encoding) as handle:
        for raw in handle:
            if not raw.strip():
                continue
            kept.append(raw.rstrip("\n\r"))
    return kept
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def sample_lines(
    lines: list[str], *, head_n: int = 3, middle_n: int = 1, tail_n: int = 1
) -> dict[str, list[str]]:
    """Pick non-overlapping head, middle and tail samples from `lines`.

    If there are no more lines than `head_n + middle_n + tail_n`, every line
    is returned under "head" and the other two lists are empty. Otherwise the
    middle window is centred in the gap between the head and tail windows.
    """
    total = len(lines)
    picked: dict[str, list[str]] = {"head": [], "middle": [], "tail": []}

    if total == 0:
        return picked

    if total <= head_n + middle_n + tail_n:
        picked["head"] = list(lines)
        return picked

    tail_from = total - tail_n
    picked["head"] = lines[:head_n]
    if middle_n > 0:
        # Centre the window inside [head_n, tail_from).
        first = (head_n + tail_from - middle_n) // 2
        picked["middle"] = lines[first : first + middle_n]
    if tail_n > 0:
        picked["tail"] = lines[tail_from:]
    return picked
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class TXTExtractor(MetadataExtractor):
    """Stage 1c extractor: builds the shared metadata envelope for .txt/.log files."""

    def __init__(self, head_n: int = 3, middle_n: int = 1, tail_n: int = 1) -> None:
        # Sample-window sizes forwarded to sample_lines() by extract().
        self.head_n, self.middle_n, self.tail_n = head_n, middle_n, tail_n

    def supports(self, file_path: Path) -> bool:
        """Return True when the path's suffix is .txt or .log (case-insensitive)."""
        suffix = file_path.suffix.lower()
        return suffix == ".txt" or suffix == ".log"

    def extract(self, file_path: Path) -> dict[str, Any]:
        """Build the metadata envelope for one text file.

        Steps: stat the file, detect its encoding, read the non-blank lines,
        infer the dominant line pattern, collect any warnings the check
        functions return, and sample head/middle/tail lines.

        Returns:
            A dict with keys `format`, `file_path`, `file_size_bytes`,
            `encoding`, `schema_version`, `schema`, `samples`, `warnings`.
            `schema` is the inferred pattern dict plus `line_count` (the
            number of non-blank lines).
        """
        collected: list[MetadataWarning] = []
        size_bytes = file_path.stat().st_size

        encoding, attempted = detect_encoding(file_path)
        _push(collected, check_latin1_fallback(final_encoding=encoding, attempted=attempted))

        lines = _read_nonblank_lines(file_path, encoding)
        line_count = len(lines)
        _push(collected, check_empty_file(row_count=line_count))

        pattern = infer_line_pattern(lines)
        kind = pattern["record_pattern"]
        _push(collected, check_no_pattern_detected(record_pattern=kind))
        _push(collected, check_likely_timestamp_prefix(record_pattern=kind))

        mixed = check_mixed_line_structure(
            match_ratio=pattern.get("match_ratio", 0.0),
            threshold=PATTERN_THRESHOLD,
        )
        _push(collected, mixed)

        # Field-count consistency only applies to delimiter-separated lines.
        if kind == "delimited":
            uneven = check_inconsistent_field_count(
                field_counts=pattern.get("field_counts", [])
            )
            _push(collected, uneven)

        schema: dict[str, Any] = {"line_count": line_count}
        schema.update(pattern)

        samples = sample_lines(
            lines,
            head_n=self.head_n,
            middle_n=self.middle_n,
            tail_n=self.tail_n,
        )

        envelope: dict[str, Any] = {
            "format": "txt",
            "file_path": str(file_path),
            "file_size_bytes": size_bytes,
            "encoding": encoding,
            "schema_version": self.SCHEMA_VERSION,
            "schema": schema,
            "samples": samples,
            "warnings": [item.to_dict() for item in collected],
        }
        return envelope
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _push(bucket: list[MetadataWarning], maybe: MetadataWarning | None) -> None:
    """Append `maybe` to `bucket` in place, ignoring a `None` result."""
    if maybe is None:
        return
    bucket.append(maybe)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _main() -> int:
    """CLI entry point: print the envelope for one file as JSON and return 0.

    Takes a single positional `file` argument. After the JSON it prints a
    rough token estimate computed as the JSON text length divided by four.
    """
    import argparse
    import json

    cli = argparse.ArgumentParser(
        prog="python -m datamorph.extractor.txt_extractor",
        description="Extract metadata envelope from a .txt/.log file.",
    )
    cli.add_argument("file", help="Path to a .txt or .log file")
    target = Path(cli.parse_args().file)

    envelope = TXTExtractor().extract(target)
    rendered = json.dumps(envelope, indent=2, default=str, ensure_ascii=False)
    print(rendered)
    print(f"# rough token estimate: ~{len(rendered) // 4} (chars / 4)")
    return 0
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
if __name__ == "__main__":
|
|
199
|
+
raise SystemExit(_main())
|