firecloud-devnet 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fc_mlops/__init__.py +3 -0
- fc_mlops/__main__.py +5 -0
- fc_mlops/anomaly.py +112 -0
- fc_mlops/artifact_store.py +111 -0
- fc_mlops/cli.py +190 -0
- fc_mlops/simulate_failure.py +100 -0
- fc_mlops/telemetry.py +72 -0
- fc_rag/__init__.py +3 -0
- fc_rag/cli.py +51 -0
- fc_rag/config.py +24 -0
- fc_rag/embedder.py +62 -0
- fc_rag/indexer.py +121 -0
- fc_rag/query_engine.py +79 -0
- fc_rag/requirements.txt +6 -0
- fc_rag/retriever.py +46 -0
- firecloud/__init__.py +17 -0
- firecloud/chunker.py +122 -0
- firecloud/cli.py +540 -0
- firecloud/crypto.py +269 -0
- firecloud/discovery.py +164 -0
- firecloud/distributor.py +269 -0
- firecloud/exceptions.py +41 -0
- firecloud/fec.py +87 -0
- firecloud/manifest.py +263 -0
- firecloud/network.py +90 -0
- firecloud/node.py +562 -0
- firecloud/storage.py +146 -0
- firecloud/sync.py +277 -0
- firecloud/transport.py +387 -0
- firecloud_devnet-0.1.0.dist-info/METADATA +158 -0
- firecloud_devnet-0.1.0.dist-info/RECORD +34 -0
- firecloud_devnet-0.1.0.dist-info/WHEEL +4 -0
- firecloud_devnet-0.1.0.dist-info/entry_points.txt +4 -0
- firecloud_devnet-0.1.0.dist-info/licenses/LICENSE +21 -0
fc_mlops/__init__.py
ADDED
fc_mlops/__main__.py
ADDED
fc_mlops/anomaly.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""IsolationForest-based anomaly scoring on telemetry readings."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from pydantic import BaseModel, ConfigDict
|
|
9
|
+
from sklearn.ensemble import IsolationForest
|
|
10
|
+
|
|
11
|
+
_LOG_PATH = Path.home() / ".fc_mlops" / "telemetry_log.jsonl"
|
|
12
|
+
_ALERTS_PATH = Path.home() / ".fc_mlops" / "alerts.jsonl"
|
|
13
|
+
|
|
14
|
+
# columns pulled from each telemetry reading
|
|
15
|
+
_FEATURES = [
|
|
16
|
+
"disk_io_read_mbps",
|
|
17
|
+
"chunk_upload_latency_ms",
|
|
18
|
+
"cpu_percent",
|
|
19
|
+
"memory_percent",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AnomalyReport(BaseModel):
    """Immutable result of one anomaly-detection run over the telemetry log."""

    # frozen=True makes instances read-only after construction.
    model_config = ConfigDict(frozen=True)

    # Time the report was produced (check_anomaly supplies a UTC timestamp).
    timestamp: datetime
    # True when the detector classified the most recent reading as an outlier.
    is_anomaly: bool
    # Detector decision score for the most recent reading, rounded by the caller.
    anomaly_score: float
    # Feature names whose latest value is more than two std devs from the mean.
    flagged_metrics: list[str]
    # Human-readable follow-up suggestion derived from the flagged metrics.
    recommendation: str
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _append_alert(report: AnomalyReport) -> None:
    """Append *report* to the alerts log as a single JSON line.

    The log's parent directory is created on demand.
    """
    alerts_dir = _ALERTS_PATH.parent
    alerts_dir.mkdir(parents=True, exist_ok=True)

    with open(_ALERTS_PATH, "a", encoding="utf-8") as alerts_file:
        # default=str stringifies values json cannot encode natively
        # (the report's datetime timestamp).
        serialized = json.dumps(report.model_dump(), default=str)
        alerts_file.write(serialized + "\n")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _load_readings(log_path: Path | None = None, max_lines: int = 200) -> list[dict]:
|
|
40
|
+
path = log_path or _LOG_PATH
|
|
41
|
+
if not path.exists():
|
|
42
|
+
return []
|
|
43
|
+
|
|
44
|
+
lines = path.read_text(encoding="utf-8").strip().splitlines()
|
|
45
|
+
tail = lines[-max_lines:] if len(lines) > max_lines else lines
|
|
46
|
+
|
|
47
|
+
readings = []
|
|
48
|
+
for line in tail:
|
|
49
|
+
try:
|
|
50
|
+
readings.append(json.loads(line))
|
|
51
|
+
except json.JSONDecodeError:
|
|
52
|
+
continue
|
|
53
|
+
return readings
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _coerce_metric(value) -> float:
    """Convert a raw telemetry value to a finite float, falling back to 0.0."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        # JSON null or a non-numeric string: treat like a missing metric.
        return 0.0
    return number if np.isfinite(number) else 0.0


def _flag_outliers(X: np.ndarray) -> list[str]:
    """Return features whose latest value is > 2 std devs from the column mean."""
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    latest = X[-1]
    return [
        feat
        for i, feat in enumerate(_FEATURES)
        # A zero std means the column is constant; nothing can be flagged.
        if stds[i] > 0 and abs(latest[i] - means[i]) > 2 * stds[i]
    ]


def _recommend(is_anomaly: bool, flagged: list[str]) -> str:
    """Pick the operator-facing recommendation for a detection result."""
    if not is_anomaly:
        return "Node healthy"
    # Priority order matters: latency first, then CPU, then disk.
    if "chunk_upload_latency_ms" in flagged:
        return "High latency — check network"
    if "cpu_percent" in flagged:
        return "CPU spike — check running processes"
    if "disk_io_read_mbps" in flagged:
        return "Disk I/O degraded — check storage health"
    return "Anomalous reading — investigate node"


def check_anomaly(log_path: Path | None = None) -> AnomalyReport | dict:
    """Run anomaly detection against the latest telemetry data.

    Pass *log_path* to override the default telemetry log location
    (used by the simulation script and tests).

    Returns:
        An ``AnomalyReport`` describing the most recent reading, or a dict
        ``{"status": "insufficient_data", "readings": <count>}`` when fewer
        than 50 usable readings are available.

    Side effects:
        When the latest reading is classified as anomalous, the report is
        appended to the alerts log.
    """
    # Only JSON objects can carry named metrics; ignore anything else.
    readings = [r for r in _load_readings(log_path) if isinstance(r, dict)]

    if len(readings) < 50:
        return {"status": "insufficient_data", "readings": len(readings)}

    # Feature matrix: one row per reading, one column per entry in _FEATURES.
    # Missing, null, non-numeric and non-finite values all become 0.0.
    X = np.array(
        [[_coerce_metric(r.get(f, 0.0)) for f in _FEATURES] for r in readings]
    )

    # Fixed random_state keeps results reproducible for identical input.
    clf = IsolationForest(contamination=0.05, random_state=42)
    clf.fit(X)

    latest = X[-1].reshape(1, -1)
    prediction = clf.predict(latest)[0]  # -1 = anomaly, 1 = normal
    score = clf.decision_function(latest)[0]
    # Convert the numpy boolean to a plain bool before it reaches the model.
    is_anomaly = bool(prediction == -1)

    flagged = _flag_outliers(X)

    report = AnomalyReport(
        timestamp=datetime.now(timezone.utc),
        is_anomaly=is_anomaly,
        anomaly_score=round(float(score), 4),
        flagged_metrics=flagged,
        recommendation=_recommend(is_anomaly, flagged),
    )

    if is_anomaly:
        _append_alert(report)

    return report
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Version-tracked ML artifact storage backed by FireCloud's Node API."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict
|
|
9
|
+
|
|
10
|
+
_MANIFEST_PATH = Path.home() / ".fc_mlops" / "artifacts.json"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ArtifactMetadata(BaseModel):
    """Immutable metadata record for a stored ML artifact."""

    # frozen=True makes instances read-only after construction.
    model_config = ConfigDict(frozen=True)

    # Caller-chosen artifact name; together with `version` it is the lookup key
    # used by load_artifact.
    name: str
    version: str
    # Restricted to the three kinds also offered by the CLI's --type option.
    artifact_type: Literal["model", "dataset", "checkpoint"]
    # Time the artifact was recorded (save_artifact supplies a UTC timestamp).
    saved_at: datetime
    # Size of the local file as reported by stat() at save time.
    file_size_bytes: int
    # Optional numeric metrics supplied by the caller (empty dict when omitted).
    metrics: dict[str, float]
    # Optional free-form tags (empty list when omitted).
    tags: list[str]
    # Identifier returned by node.upload(); passed back to node.download().
    firecloud_file_id: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _load_manifest() -> list[dict]:
    """Read the artifact manifest from disk.

    Returns:
        The manifest entries as a list of dicts. An empty list is returned
        when the manifest file is missing, unreadable, not valid JSON, or
        does not hold a JSON array. Non-object items inside the array are
        dropped so callers can safely use dict methods on every entry.
    """
    try:
        # EAFP: a missing file raises FileNotFoundError (an OSError), which
        # avoids the exists()-then-read race.
        raw = _MANIFEST_PATH.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return []

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return []

    # A valid JSON document that is not an array (e.g. {} or null) would
    # otherwise break callers that append to or iterate the result.
    if not isinstance(data, list):
        return []
    return [entry for entry in data if isinstance(entry, dict)]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _save_manifest(entries: list[dict]) -> None:
    """Persist *entries* as the artifact manifest, replacing any previous file.

    The manifest is written to a sibling temporary file and then moved into
    place, so an interrupted write cannot leave a truncated manifest behind
    (which the loader would otherwise treat as empty).

    Args:
        entries: Manifest records to store. Values json cannot encode
            natively (such as datetimes) are stringified via ``default=str``.
    """
    _MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)

    payload = json.dumps(entries, indent=2, default=str)

    tmp_path = _MANIFEST_PATH.with_name(_MANIFEST_PATH.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Path.replace overwrites the destination in a single rename step.
    tmp_path.replace(_MANIFEST_PATH)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def save_artifact(
    node,
    local_path: Path,
    name: str,
    version: str,
    artifact_type: str,
    metrics: dict[str, float] | None = None,
    tags: list[str] | None = None,
) -> ArtifactMetadata:
    """Upload a local file through *node* and register it in the manifest.

    Args:
        node: Object exposing an awaitable ``upload(path)`` that returns the
            stored file's identifier.
        local_path: File to upload.
        name: Artifact name recorded in the manifest.
        version: Artifact version recorded in the manifest.
        artifact_type: Artifact kind; validated by ``ArtifactMetadata``.
        metrics: Optional metric values; an empty dict is stored when omitted.
        tags: Optional tags; an empty list is stored when omitted.

    Returns:
        The ``ArtifactMetadata`` that was appended to the manifest.
    """
    source = Path(local_path)
    uploaded_id = await node.upload(source)

    record = ArtifactMetadata(
        name=name,
        version=version,
        artifact_type=artifact_type,
        saved_at=datetime.now(timezone.utc),
        file_size_bytes=source.stat().st_size,
        metrics=metrics if metrics else {},
        tags=tags if tags else [],
        firecloud_file_id=uploaded_id,
    )

    # Append-only: existing manifest records are kept untouched.
    manifest = _load_manifest()
    manifest.append(record.model_dump())
    _save_manifest(manifest)

    return record
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def load_artifact(
    node,
    name: str,
    version: str,
    destination: Path,
) -> Path:
    """Download an artifact by name+version from the manifest.

    Args:
        node: Object exposing an awaitable ``download(file_id, destination)``.
        name: Artifact name to look up.
        version: Artifact version to look up.
        destination: Where the downloaded file should be written.

    Returns:
        The resolved (absolute) destination path.

    Raises:
        ValueError: If no manifest entry matches *name* and *version*, or the
            matching entry carries no FireCloud file id.
    """
    # .get() keeps a malformed entry (missing keys) from raising KeyError;
    # such entries simply never match. The first matching entry wins.
    match = next(
        (
            entry
            for entry in _load_manifest()
            if entry.get("name") == name and entry.get("version") == version
        ),
        None,
    )

    if match is None:
        raise ValueError(
            f"Artifact '{name}' version '{version}' not found in manifest"
        )

    file_id = match.get("firecloud_file_id")
    if not file_id:
        raise ValueError(
            f"Artifact '{name}' version '{version}' has no FireCloud file id"
        )

    destination = Path(destination)
    await node.download(file_id, destination)
    return destination.resolve()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def list_artifacts(artifact_type: str | None = None) -> list[ArtifactMetadata]:
    """List manifest records as ``ArtifactMetadata`` objects.

    When *artifact_type* is given (and non-empty) only records of that type
    are returned; otherwise every record is included.
    """
    return [
        ArtifactMetadata(**record)
        for record in _load_manifest()
        if not artifact_type or record.get("artifact_type") == artifact_type
    ]
|
fc_mlops/cli.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""fc-ml CLI — artifact management, telemetry, and anomaly detection."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@click.group()
def cli():
    """fc-ml — MLOps extensions for FireCloud."""
    # Root command group; it only serves as the attachment point for the
    # subcommands registered on it, so it has no body of its own.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# --- Artifact commands ---
|
|
15
|
+
|
|
16
|
+
@cli.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--name", "-n", required=True, help="Artifact name.")
@click.option("--version", "-v", required=True, help="Artifact version.")
@click.option(
    "--type", "artifact_type",
    type=click.Choice(["model", "dataset", "checkpoint"]),
    required=True,
    help="Artifact type.",
)
@click.option(
    "--metric", "-m",
    multiple=True,
    help="Metric in key=value format (repeatable).",
)
@click.option("--passphrase", prompt=True, hide_input=True, help="Network passphrase.")
@click.option("--port", default=7474, type=int)
@click.option("--storage", default=None, type=click.Path())
def save(
    path: str,
    name: str,
    version: str,
    artifact_type: str,
    metric: tuple[str, ...],
    passphrase: str,
    port: int,
    storage: str | None,
):
    """Save an artifact to the FireCloud network."""
    # Flow: parse --metric pairs, start a local node (discovery disabled),
    # upload PATH via save_artifact, print a summary, and always stop the node.
    # Malformed --metric values are rejected with a usage error instead of
    # being dropped silently or surfacing as a raw float() traceback.
    import asyncio
    from firecloud import Network, Node
    from fc_mlops.artifact_store import save_artifact

    metrics = {}
    for m in metric:
        key, separator, raw_value = m.partition("=")
        key = key.strip()
        if not separator or not key:
            raise click.BadParameter(
                f"expected key=value, got {m!r}", param_hint="--metric"
            )
        try:
            metrics[key] = float(raw_value.strip())
        except ValueError:
            raise click.BadParameter(
                f"value for {key!r} is not a number: {raw_value.strip()!r}",
                param_hint="--metric",
            ) from None

    storage_path = Path(storage) if storage else Path.home() / ".firecloud" / "storage"

    async def _run():
        net = Network.load(Path.home() / ".firecloud" / "network.key", passphrase)
        node = Node(network=net, storage_path=storage_path, port=port, enable_discovery=False)
        await node.start()
        try:
            meta = await save_artifact(
                node, Path(path), name, version, artifact_type, metrics, []
            )
            click.echo(click.style("✓ Artifact saved.", fg="green"))
            click.echo(f" Name : {meta.name}")
            click.echo(f" Version : {meta.version}")
            click.echo(f" File ID : {meta.firecloud_file_id}")
            click.echo(f" Size : {meta.file_size_bytes} bytes")
        finally:
            # Stop the node even when the upload fails.
            await node.stop()

    asyncio.run(_run())
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@cli.command()
@click.argument("name")
@click.option("--version", "-v", required=True, help="Artifact version.")
@click.option("--dest", "-d", required=True, type=click.Path(), help="Destination path.")
@click.option("--passphrase", prompt=True, hide_input=True, help="Network passphrase.")
@click.option("--port", default=7474, type=int)
@click.option("--storage", default=None, type=click.Path())
def load(
    name: str,
    version: str,
    dest: str,
    passphrase: str,
    port: int,
    storage: str | None,
):
    """Load an artifact from the FireCloud network."""
    # Flow: start a local node (discovery disabled), download NAME/--version
    # to --dest via load_artifact, and always stop the node afterwards.
    # A ValueError from the lookup (e.g. unknown artifact) is reported as a
    # regular CLI error rather than a traceback.
    import asyncio
    from firecloud import Network, Node
    from fc_mlops.artifact_store import load_artifact

    storage_path = Path(storage) if storage else Path.home() / ".firecloud" / "storage"

    async def _run():
        net = Network.load(Path.home() / ".firecloud" / "network.key", passphrase)
        node = Node(network=net, storage_path=storage_path, port=port, enable_discovery=False)
        await node.start()
        try:
            result_path = await load_artifact(node, name, version, Path(dest))
        except ValueError as exc:
            raise click.ClickException(str(exc)) from exc
        else:
            click.echo(click.style(f"✓ Artifact downloaded to {result_path}", fg="green"))
        finally:
            # Stop the node on success and on failure alike.
            await node.stop()

    asyncio.run(_run())
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@cli.command("list")
@click.option(
    "--type", "artifact_type",
    type=click.Choice(["model", "dataset", "checkpoint"]),
    default=None,
    help="Filter by artifact type.",
)
def list_artifacts(artifact_type: str | None):
    """List tracked ML artifacts."""
    # Imported under an alias because this command shares the store
    # function's name.
    from fc_mlops.artifact_store import list_artifacts as _list

    artifacts = _list(artifact_type)
    if artifacts:
        header = f"{'Name':<20} {'Version':<10} {'Type':<12} {'Size':<12} {'File ID':<20}"
        click.echo(click.style(header, bold=True))
        click.echo("─" * 75)
        for a in artifacts:
            size = f"{a.file_size_bytes:,} B"
            # File IDs are cut to 20 characters to fit the last column.
            row = (
                f"{a.name:<20} {a.version:<10} {a.artifact_type:<12} "
                f"{size:<12} {a.firecloud_file_id[:20]}"
            )
            click.echo(row)
    else:
        click.echo("No artifacts found.")
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# --- Telemetry ---
|
|
143
|
+
|
|
144
|
+
@cli.group()
def telemetry():
    """Telemetry server commands."""
    # Container group only; its subcommands carry the behaviour.
|
|
148
|
+
|
|
149
|
+
@telemetry.command("start")
def telemetry_start():
    """Start the telemetry metrics server on localhost:7475."""
    # Imported lazily so the server module is only loaded when this
    # command actually runs.
    from fc_mlops.telemetry import start_server as run_server

    banner = click.style("Starting telemetry server on http://127.0.0.1:7475", fg="cyan")
    click.echo(banner)
    run_server()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# --- Anomaly detection ---
|
|
158
|
+
|
|
159
|
+
@cli.group()
def anomaly():
    """Anomaly detection commands."""
    # Container group only; its subcommands carry the behaviour.
|
|
163
|
+
|
|
164
|
+
@anomaly.command("check")
def anomaly_check():
    """Run anomaly detection on recent telemetry data."""
    from fc_mlops.anomaly import check_anomaly

    result = check_anomaly()

    # check_anomaly returns a plain dict when there are too few readings
    # and a report object otherwise.
    if not isinstance(result, dict):
        report_lines = [
            click.style("Anomaly Detection Results", bold=True),
            f" Anomaly detected : {'Yes' if result.is_anomaly else 'No'}",
            f" Anomaly score : {result.anomaly_score}",
            f" Flagged metrics : {', '.join(result.flagged_metrics) or 'None'}",
            f" Recommendation : {result.recommendation}",
        ]
        for report_line in report_lines:
            click.echo(report_line)
    else:
        click.echo(f"Insufficient data: {result.get('readings', 0)} readings (need ≥ 50)")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@cli.command("simulate-failure")
def simulate_failure():
    """Run the failure simulation demo."""
    # Aliased import: avoids a bare `main` name inside the CLI module.
    from fc_mlops.simulate_failure import main as run_simulation

    run_simulation()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
if __name__ == "__main__":
|
|
190
|
+
cli()
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Standalone failure simulation demo.
|
|
2
|
+
|
|
3
|
+
Generates synthetic telemetry, injects anomalies, and runs detection.
|
|
4
|
+
Run via: python -m fc_mlops.simulate_failure
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import random
|
|
9
|
+
import tempfile
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from rich.console import Console
|
|
14
|
+
from rich.table import Table
|
|
15
|
+
|
|
16
|
+
from fc_mlops.anomaly import check_anomaly
|
|
17
|
+
|
|
18
|
+
console = Console()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _write_reading(log_path: Path, reading: dict) -> None:
    """Append *reading* to *log_path* as one newline-terminated JSON record.

    Values json cannot encode natively are written as their ``str()`` form.
    """
    with open(log_path, "a", encoding="utf-8") as sink:
        serialized = json.dumps(reading, default=str)
        sink.write(f"{serialized}\n")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _normal_reading() -> dict:
    """Build one synthetic telemetry reading with baseline metric ranges."""

    def _sample(low: float, high: float) -> float:
        # Uniform draw rounded to two decimals, like every float metric here.
        return round(random.uniform(low, high), 2)

    # Dict literals evaluate in order, so the random draws happen in the
    # same sequence as the keys are listed.
    return {
        "node_id": "sim-node",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "disk_io_read_mbps": _sample(50, 150),
        "disk_io_write_mbps": _sample(30, 100),
        "chunk_upload_latency_ms": _sample(20, 50),
        "active_connections": random.randint(1, 5),
        "storage_used_percent": _sample(30, 60),
        "cpu_percent": _sample(10, 30),
        "memory_percent": _sample(40, 60),
    }
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _anomalous_reading() -> dict:
    """Build one synthetic telemetry reading with failure-signature ranges."""

    def _sample(low: float, high: float) -> float:
        # Uniform draw rounded to two decimals, like every float metric here.
        return round(random.uniform(low, high), 2)

    # Dict literals evaluate in order, so the random draws happen in the
    # same sequence as the keys are listed.
    return {
        "node_id": "sim-node",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "disk_io_read_mbps": _sample(5, 15),
        "disk_io_write_mbps": _sample(1, 5),
        "chunk_upload_latency_ms": _sample(400, 600),
        "active_connections": random.randint(0, 1),
        "storage_used_percent": _sample(85, 98),
        "cpu_percent": _sample(85, 95),
        "memory_percent": _sample(80, 95),
    }
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def main() -> None:
    """Run the three-phase failure simulation and print the outcome.

    Phase 1 writes 60 baseline readings, phase 2 appends 10 failure-signature
    readings, and phase 3 runs ``check_anomaly`` on the resulting log. The
    log lives in a temporary directory that is removed once detection has
    finished, so repeated runs do not accumulate scratch directories.
    """
    with tempfile.TemporaryDirectory(prefix="fc_mlops_sim_") as tmp_dir:
        log_path = Path(tmp_dir) / "telemetry_log.jsonl"

        # baseline
        console.print("[bold cyan][Phase 1][/bold cyan] Generating 60 baseline readings...")
        for _ in range(60):
            _write_reading(log_path, _normal_reading())

        # inject failures
        console.print("[bold yellow][Phase 2][/bold yellow] Injecting failure signatures...")
        for _ in range(10):
            _write_reading(log_path, _anomalous_reading())

        # detect
        console.print("[bold magenta][Phase 3][/bold magenta] Running anomaly detection...")
        result = check_anomaly(log_path=log_path)

    # The result is fully materialised, so rendering no longer needs the log.
    if isinstance(result, dict):
        console.print(f"[red]Insufficient data: {result}[/red]")
        console.print("[bold red]✗ FAIL: Not enough readings for detection[/bold red]")
        return

    table = Table(title="Anomaly Detection Results", show_header=True)
    table.add_column("Metric", style="cyan", width=25)
    table.add_column("Value", style="white", width=50)

    table.add_row("Anomaly detected", "[red]Yes[/red]" if result.is_anomaly else "[green]No[/green]")
    table.add_row("Anomaly score", str(round(result.anomaly_score, 4)))
    table.add_row("Flagged metrics", ", ".join(result.flagged_metrics) if result.flagged_metrics else "None")
    table.add_row("Recommendation", result.recommendation)

    console.print()
    console.print(table)
    console.print()

    if result.is_anomaly:
        console.print("[bold green]✓ PASS: Anomaly correctly detected[/bold green]")
    else:
        console.print(
            "[bold red]✗ FAIL: Anomaly not detected — "
            "check contamination parameter[/bold red]"
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
if __name__ == "__main__":
|
|
100
|
+
main()
|
fc_mlops/telemetry.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""FastAPI metrics endpoint with psutil system monitoring."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import psutil
|
|
9
|
+
from fastapi import FastAPI
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
_LOG_PATH = Path.home() / ".fc_mlops" / "telemetry_log.jsonl"
|
|
13
|
+
|
|
14
|
+
app = FastAPI(title="FireCloud Telemetry", version="0.1.0")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NodeMetrics(BaseModel):
    """Snapshot of system and node health metrics."""
    # Identifier of the reporting node (_collect_metrics always uses "local").
    node_id: str
    # Time the snapshot was taken (_collect_metrics supplies a UTC timestamp).
    timestamp: datetime
    # Disk throughput derived from two psutil counter samples.
    disk_io_read_mbps: float
    disk_io_write_mbps: float
    # NOTE(review): _collect_metrics hard-codes these two to 0.0 / 0; they look
    # like placeholders until the node reports real values — confirm.
    chunk_upload_latency_ms: float
    active_connections: int
    # Usage percentage of the filesystem mounted at "/".
    storage_used_percent: float
    # System-wide CPU and virtual-memory utilisation as reported by psutil.
    cpu_percent: float
    memory_percent: float
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _collect_metrics() -> NodeMetrics:
    """Sample system metrics over a short window and return them.

    Disk throughput and CPU utilisation are both measured across the same
    ~0.1 s window. Throughput is divided by the measured elapsed time rather
    than the nominal sleep duration, and the CPU counter is primed first so
    the reported percentage reflects the window instead of the meaningless
    value psutil returns on a first ``interval=None`` call.

    Returns:
        A ``NodeMetrics`` snapshot. Disk rates are 0.0 when psutil cannot
        provide disk counters. ``chunk_upload_latency_ms`` and
        ``active_connections`` are fixed at 0.0 / 0 here.
    """
    sample_window = 0.1  # seconds
    bytes_per_mb = 1024 * 1024

    # Prime the CPU counter: with interval=None, psutil reports utilisation
    # since the previous call, so the first result must be discarded.
    psutil.cpu_percent(interval=None)

    disk_before = psutil.disk_io_counters()
    started = time.monotonic()
    time.sleep(sample_window)
    disk_after = psutil.disk_io_counters()
    elapsed = time.monotonic() - started
    cpu = psutil.cpu_percent(interval=None)

    if disk_before is not None and disk_after is not None and elapsed > 0:
        # max(0, ...) guards against a counter reset between the two samples.
        read_delta = max(0, disk_after.read_bytes - disk_before.read_bytes)
        write_delta = max(0, disk_after.write_bytes - disk_before.write_bytes)
        read_mbps = read_delta / elapsed / bytes_per_mb
        write_mbps = write_delta / elapsed / bytes_per_mb
    else:
        read_mbps = write_mbps = 0.0

    disk_usage = psutil.disk_usage("/")

    return NodeMetrics(
        node_id="local",
        timestamp=datetime.now(timezone.utc),
        disk_io_read_mbps=round(read_mbps, 2),
        disk_io_write_mbps=round(write_mbps, 2),
        chunk_upload_latency_ms=0.0,
        active_connections=0,
        storage_used_percent=round(disk_usage.percent, 2),
        cpu_percent=round(cpu, 2),
        memory_percent=round(psutil.virtual_memory().percent, 2),
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@app.get("/metrics", response_model=NodeMetrics)
def get_metrics() -> NodeMetrics:
    """Collect and return current system metrics."""
    snapshot = _collect_metrics()

    # Every request also appends the snapshot to the JSONL telemetry log,
    # which is the file the anomaly detector reads.
    log_dir = _LOG_PATH.parent
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(_LOG_PATH, "a", encoding="utf-8") as log_file:
        record = json.dumps(snapshot.model_dump(), default=str)
        log_file.write(record + "\n")

    return snapshot
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def start_server(host: str = "127.0.0.1", port: int = 7475) -> None:
    """Run the telemetry app under uvicorn (blocks until the server stops).

    Args:
        host: Interface to bind. Defaults to loopback so the endpoint is not
            exposed beyond the local machine unless explicitly requested.
        port: TCP port to listen on. Defaults to 7475.
    """
    # Imported lazily so importing this module does not require uvicorn.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
|
fc_rag/__init__.py
ADDED
fc_rag/cli.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""fc-rag CLI — index files and query the local RAG pipeline."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@click.group()
def cli():
    """fc-rag — Private RAG pipeline for FireCloud docs."""
    # Root command group; it only serves as the attachment point for the
    # subcommands registered on it, so it has no body of its own.
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@cli.command()
@click.argument("path", type=click.Path(exists=True))
def index(path: str):
    """Index files at PATH into the local vector store."""
    from fc_rag.indexer import index_path

    target = Path(path)
    total_chunks = index_path(target)

    # The file count is derived after indexing purely for the summary line.
    if not target.is_file():
        # NOTE(review): presumably mirrors the suffixes the indexer accepts —
        # confirm against fc_rag.indexer.
        supported = {".txt", ".md", ".py", ".json"}
        matching = [
            candidate
            for candidate in target.rglob("*")
            if candidate.is_file() and candidate.suffix in supported
        ]
        file_count = len(matching)
    else:
        file_count = 1

    click.echo(f"Indexed {total_chunks} chunks from {file_count} files")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@cli.command()
@click.argument("question")
def query(question: str):
    """Query the local RAG pipeline with a natural-language question."""
    from fc_rag.query_engine import query as run_query
    from fc_rag.retriever import retrieve

    click.echo(run_query(question))

    # A separate retrieval pass supplies the filenames for the source list.
    results = retrieve(question)
    if not results:
        return

    sources = sorted({r.filename for r in results})
    click.echo(f"\nSources: {', '.join(sources)}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
|
|
51
|
+
cli()
|
fc_rag/config.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Pydantic settings for the fc_rag pipeline."""
|
|
2
|
+
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from pydantic import BaseModel, ConfigDict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Settings(BaseModel):
    """All paths default to ~/.fc_rag/ so it works out of the box."""

    # frozen=True makes the settings object read-only after construction.
    model_config = ConfigDict(frozen=True)

    # Model identifiers; presumably an Ollama model tag and an embedding
    # model name — verify against the embedder/query engine.
    ollama_model: str = "llama3.2:3b"
    embedding_model: str = "BAAI/bge-small-en-v1.5"
    # Vector-store location and collection name. The path default is computed
    # once, when the class body is evaluated at import time.
    qdrant_path: Path = Path.home() / ".fc_rag" / "vectors"
    collection_name: str = "firecloud_docs"
    # NOTE(review): looks like the number of chunks retrieved per query and a
    # retry budget — confirm against the retriever/query engine.
    top_k: int = 5
    max_retries: int = 3
    # JSONL query log location (also computed once at import time).
    log_path: Path = Path.home() / ".fc_rag" / "query_log.jsonl"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The result is memoised, so every call after the first returns the same
    object.
    """
    settings = Settings()
    return settings
|