arkaos 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/VERSION +1 -1
- package/config/constitution.yaml +2 -0
- package/config/hooks/user-prompt-submit-v2.sh +11 -0
- package/core/budget/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/budget/__pycache__/manager.cpython-313.pyc +0 -0
- package/core/budget/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/knowledge/__init__.py +6 -0
- package/core/knowledge/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/chunker.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/embedder.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/indexer.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/ingest.cpython-313.pyc +0 -0
- package/core/knowledge/__pycache__/vector_store.cpython-313.pyc +0 -0
- package/core/knowledge/chunker.py +121 -0
- package/core/knowledge/embedder.py +52 -0
- package/core/knowledge/indexer.py +97 -0
- package/core/knowledge/ingest.py +270 -0
- package/core/knowledge/vector_store.py +213 -0
- package/core/obsidian/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/obsidian/__pycache__/templates.cpython-313.pyc +0 -0
- package/core/obsidian/__pycache__/writer.cpython-313.pyc +0 -0
- package/core/orchestration/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/orchestration/__pycache__/patterns.cpython-313.pyc +0 -0
- package/core/orchestration/__pycache__/protocol.cpython-313.pyc +0 -0
- package/core/runtime/__pycache__/subagent.cpython-313.pyc +0 -0
- package/core/runtime/subagent.py +5 -0
- package/core/squads/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/squads/schema.py +3 -0
- package/core/squads/templates/project-squad.yaml +28 -0
- package/core/synapse/__pycache__/engine.cpython-313.pyc +0 -0
- package/core/synapse/__pycache__/layers.cpython-313.pyc +0 -0
- package/core/synapse/engine.py +5 -1
- package/core/synapse/layers.py +95 -9
- package/core/tasks/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/tasks/schema.py +1 -0
- package/core/workflow/__pycache__/engine.cpython-313.pyc +0 -0
- package/core/workflow/__pycache__/schema.cpython-313.pyc +0 -0
- package/departments/dev/agents/research-assistant.yaml +51 -0
- package/departments/kb/agents/data-collector.yaml +51 -0
- package/departments/ops/agents/doc-writer.yaml +51 -0
- package/departments/pm/agents/pm-director.yaml +1 -1
- package/installer/cli.js +49 -0
- package/installer/init.js +105 -0
- package/installer/migrate.js +4 -1
- package/package.json +1 -1
- package/pyproject.toml +16 -1
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2.0
|
|
1
|
+
2.1.0
|
package/config/constitution.yaml
CHANGED
|
@@ -60,6 +60,8 @@ enforcement_levels:
|
|
|
60
60
|
|
|
61
61
|
quality_gate:
|
|
62
62
|
description: "Mandatory pre-delivery review. Nothing ships without APPROVED verdict."
|
|
63
|
+
trigger: "After the last execution phase, before delivery to user"
|
|
64
|
+
frequency: "Once per workflow execution, not per phase"
|
|
63
65
|
agents:
|
|
64
66
|
orchestrator:
|
|
65
67
|
id: cqo-marta
|
|
@@ -7,6 +7,17 @@
|
|
|
7
7
|
|
|
8
8
|
input=$(cat)
|
|
9
9
|
|
|
10
|
+
# ─── V1 Migration Detection ─────────────────────────────────────────────
|
|
11
|
+
V1_PATHS=("$HOME/.claude/skills/arka-os" "$HOME/.claude/skills/arkaos")
|
|
12
|
+
MIGRATION_MARKER="$HOME/.arkaos/migrated-from-v1"
|
|
13
|
+
|
|
14
|
+
for v1_path in "${V1_PATHS[@]}"; do
|
|
15
|
+
if [ -d "$v1_path" ] && [ ! -f "$MIGRATION_MARKER" ]; then
|
|
16
|
+
echo "{\"additionalContext\": \"[MIGRATION] ArkaOS v1 detected at $v1_path. Run: npx arkaos migrate — This will backup v1, preserve your data, and install v2. See: https://github.com/andreagroferreira/arka-os#install\"}"
|
|
17
|
+
exit 0
|
|
18
|
+
fi
|
|
19
|
+
done
|
|
20
|
+
|
|
10
21
|
# ─── Performance Timing ──────────────────────────────────────────────────
|
|
11
22
|
_HOOK_START_NS=$(date +%s%N 2>/dev/null || echo "0")
|
|
12
23
|
_hook_ms() {
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Markdown chunker — split documents into embeddable chunks.
|
|
2
|
+
|
|
3
|
+
Splits on paragraph boundaries, respects heading structure,
|
|
4
|
+
and maintains overlap for context continuity.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Chunk:
    """A text chunk ready for embedding."""
    text: str
    heading: str = ""  # Heading context active when the chunk was flushed
    index: int = 0     # Position of the chunk within its source document
    source: str = ""   # Source file path (metadata only)

    @property
    def token_estimate(self) -> int:
        """Rough token count: whitespace-delimited words, not model tokens."""
        return len(self.text.split())


def _emit(chunks: list[Chunk], text: str, heading: str, source: str) -> None:
    """Append *text* (stripped) as the next Chunk; index is the running count."""
    chunks.append(Chunk(
        text=text.strip(),
        heading=heading,
        index=len(chunks),
        source=source,
    ))


def _overlap_tail(text: str, overlap_tokens: int) -> tuple[str, int]:
    """Return the overlap carry-over after a flush.

    Gives (carry_text, carry_token_count): the last *overlap_tokens* words of
    *text* plus a trailing space, or ("", 0) when *text* has no more than
    *overlap_tokens* words (nothing worth carrying).
    """
    words = text.split()
    if len(words) > overlap_tokens:
        carry = " ".join(words[-overlap_tokens:]) + " "
        return carry, len(carry.split())
    return "", 0


def chunk_markdown(
    content: str,
    max_tokens: int = 512,
    overlap_tokens: int = 50,
    source: str = "",
) -> list[Chunk]:
    """Split markdown content into chunks at paragraph boundaries.

    Splits on blank lines, tracks the current heading for context, and keeps
    an *overlap_tokens*-word tail between consecutive chunks for continuity.
    Oversized single paragraphs are further split on sentence boundaries.

    Args:
        content: Markdown text to chunk.
        max_tokens: Maximum tokens per chunk.
        overlap_tokens: Token overlap between consecutive chunks.
        source: Source file path for metadata.

    Returns:
        List of Chunk objects (empty for empty/whitespace-only input).
    """
    # Strip YAML frontmatter delimited by leading/trailing "---"
    body = content
    if content.startswith("---"):
        end = content.find("---", 3)
        if end != -1:
            body = content[end + 3:].strip()

    # Split into paragraphs (double newline), dropping empty blocks
    blocks = [b.strip() for b in re.split(r'\n\n+', body) if b.strip()]

    chunks: list[Chunk] = []
    current_heading = ""
    current_text = ""
    current_tokens = 0

    for block in blocks:
        # Track headings.
        # NOTE(review): current_heading is updated before any buffered text is
        # flushed, so text accumulated under the previous heading can be tagged
        # with the new one — original behavior preserved; confirm intended.
        heading_match = re.match(r'^(#{1,6})\s+(.+)', block)
        if heading_match:
            current_heading = heading_match.group(2)

        block_tokens = len(block.split())

        # A single oversized block: flush the buffer, then split by sentence
        if block_tokens > max_tokens:
            if current_text:
                _emit(chunks, current_text, current_heading, source)
                current_text = ""
                current_tokens = 0

            for sentence in re.split(r'(?<=[.!?])\s+', block):
                sent_tokens = len(sentence.split())
                if current_tokens + sent_tokens > max_tokens and current_text:
                    _emit(chunks, current_text, current_heading, source)
                    current_text, current_tokens = _overlap_tail(current_text, overlap_tokens)
                current_text += sentence + " "
                current_tokens += sent_tokens
            continue

        # Would adding this block exceed the budget? Flush first.
        if current_tokens + block_tokens > max_tokens and current_text:
            _emit(chunks, current_text, current_heading, source)
            current_text, current_tokens = _overlap_tail(current_text, overlap_tokens)

        current_text += block + "\n\n"
        current_tokens += block_tokens

    # Final chunk
    if current_text.strip():
        _emit(chunks, current_text, current_heading, source)

    return chunks
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Embedding wrapper — local embeddings via fastembed.
|
|
2
|
+
|
|
3
|
+
Graceful degradation: if fastembed is not installed, returns None
|
|
4
|
+
and the vector store falls back to keyword matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
# Lazy import — fastembed is optional
_model = None
_model_name = "BAAI/bge-small-en-v1.5"  # 384 dims, fast, good quality
EMBEDDING_DIMS = 384


def get_model():
    """Get or create the embedding model (lazy singleton).

    Returns:
        The fastembed ``TextEmbedding`` instance, or None when fastembed is
        not installed (callers fall back to keyword matching).
    """
    global _model
    if _model is None:
        try:
            from fastembed import TextEmbedding
            _model = TextEmbedding(_model_name)
        except ImportError:
            return None
    return _model


def embed(text: str) -> Optional[list[float]]:
    """Embed a single text. Returns None if fastembed unavailable."""
    model = get_model()
    if model is None:
        return None
    results = list(model.embed([text]))
    return results[0].tolist() if results else None


def embed_batch(texts: list[str]) -> Optional[list[list[float]]]:
    """Embed multiple texts. Returns None if fastembed unavailable.

    An empty input returns [] regardless of model availability, so callers
    can distinguish "nothing to embed" from "no embedding backend".
    """
    if not texts:
        return []
    model = get_model()
    if model is None:
        return None
    return [emb.tolist() for emb in model.embed(texts)]


def is_available() -> bool:
    """Check if the fastembed package is installed.

    Probes with ``find_spec`` instead of importing the (heavy) module and
    discarding the name, which was also an unused-import lint violation.
    """
    import importlib.util
    return importlib.util.find_spec("fastembed") is not None
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Knowledge indexer — walk directories and index markdown files.
|
|
2
|
+
|
|
3
|
+
Supports incremental indexing (skips already-indexed files by hash).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import hashlib
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Callable, Optional
|
|
9
|
+
|
|
10
|
+
from core.knowledge.chunker import chunk_markdown
|
|
11
|
+
from core.knowledge.vector_store import VectorStore
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def file_hash(path: Path) -> str:
    """Return a short (16 hex char) SHA-256 digest of the file's raw bytes."""
    digest = hashlib.sha256(path.read_bytes()).hexdigest()
    return digest[:16]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def index_directory(
    directory: str | Path,
    store: VectorStore,
    pattern: str = "**/*.md",
    on_progress: Optional[Callable[[int, int, str], None]] = None,
    max_tokens: int = 512,
    skip_indexed: bool = True,
) -> dict:
    """Index all markdown files under *directory* into *store*.

    Args:
        directory: Root directory to scan.
        store: VectorStore to index into.
        pattern: Glob pattern selecting files.
        on_progress: Callback(current, total, filename).
        max_tokens: Max tokens per chunk.
        skip_indexed: Skip files whose content hash is already indexed.

    Returns:
        Dict with: files_scanned, files_indexed, files_skipped, chunks_created.
    """
    stats = {"files_scanned": 0, "files_indexed": 0, "files_skipped": 0, "chunks_created": 0}

    root = Path(directory)
    if not root.exists():
        return stats

    def _visible(candidate: Path) -> bool:
        # Exclude anything under a dot-directory (.obsidian, .git, ...)
        return not any(part.startswith(".") for part in candidate.relative_to(root).parts)

    files = [f for f in sorted(root.glob(pattern)) if _visible(f)]
    stats["files_scanned"] = len(files)

    for position, md_file in enumerate(files, start=1):
        if on_progress:
            on_progress(position, len(files), md_file.name)

        content_hash = file_hash(md_file)

        if skip_indexed and store.is_file_indexed(content_hash):
            stats["files_skipped"] += 1
            continue

        try:
            text = md_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            stats["files_skipped"] += 1
            continue

        # Very small files carry too little signal to be worth indexing
        if len(text.split()) < 20:
            stats["files_skipped"] += 1
            continue

        # Drop any stale chunks for this file before re-indexing
        store.remove_file(str(md_file))

        pieces = chunk_markdown(text, max_tokens=max_tokens, source=str(md_file))
        if pieces:
            stats["chunks_created"] += store.index_chunks(
                texts=[c.text for c in pieces],
                headings=[c.heading for c in pieces],
                source=str(md_file),
                file_hash=content_hash,
                metadata={"relative_path": str(md_file.relative_to(root))},
            )
        stats["files_indexed"] += 1

    return stats
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Knowledge ingest engine — process YouTube, PDF, audio, web, markdown.
|
|
2
|
+
|
|
3
|
+
Downloads, transcribes, extracts text, chunks, embeds, and indexes into
|
|
4
|
+
the vector store. Reports progress via callback for real-time UI updates.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import tempfile
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Callable, Optional
|
|
13
|
+
|
|
14
|
+
from core.knowledge.chunker import chunk_markdown
|
|
15
|
+
from core.knowledge.vector_store import VectorStore
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class IngestResult:
    """Outcome of a single ingest operation."""
    source: str              # URL or file path that was ingested
    source_type: str         # youtube / pdf / audio / web / markdown
    text_length: int = 0     # Characters of extracted text
    chunks_created: int = 0  # Chunks actually written to the vector store
    title: str = ""          # Human-readable title derived from the source
    error: str = ""          # Failure message; populated when success is False
    success: bool = True


# Signature: (percent, message)
ProgressCallback = Callable[[int, str], None]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def detect_source_type(source: str) -> str:
    """Classify *source* as youtube / web / pdf / audio / markdown / unknown.

    URLs are checked first (YouTube domains beat the generic web case);
    everything else is dispatched on the file extension.
    """
    lowered = source.lower()

    # YouTube URLs take priority over the generic web handler
    if "youtube.com" in lowered or "youtu.be" in lowered:
        return "youtube"

    # Any other http(s) URL is scraped as a web page
    if lowered.startswith("http://") or lowered.startswith("https://"):
        return "web"

    # Local files: dispatch on extension
    extension_types = {
        ".pdf": "pdf",
        ".mp3": "audio",
        ".wav": "audio",
        ".m4a": "audio",
        ".ogg": "audio",
        ".flac": "audio",
        ".webm": "audio",
        ".md": "markdown",
        ".txt": "markdown",
        ".rst": "markdown",
    }
    return extension_types.get(Path(source).suffix.lower(), "unknown")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class IngestEngine:
    """Processes content from various sources into the vector store.

    Each supported source type (youtube, pdf, audio, web, markdown) has a
    ``_process_*`` method that returns ``(text, title)``; ``ingest`` then
    chunks the text and writes it to the configured VectorStore. All heavy
    dependencies (yt-dlp, pdfplumber, whisper, requests/bs4) are imported
    lazily so the engine loads even when none are installed.
    """

    def __init__(self, store: VectorStore, media_dir: str | Path = "") -> None:
        # Scratch directory for downloaded media; created eagerly.
        self._store = store
        self._media_dir = Path(media_dir) if media_dir else Path.home() / ".arkaos" / "media"
        self._media_dir.mkdir(parents=True, exist_ok=True)

    def ingest(
        self,
        source: str,
        source_type: str = "",
        on_progress: Optional[ProgressCallback] = None,
        metadata: dict | None = None,
    ) -> IngestResult:
        """Ingest content from any supported source.

        Args:
            source: URL or file path.
            source_type: youtube, pdf, audio, web, markdown. Auto-detected if empty.
            on_progress: Callback(percent, message) for progress updates.
            metadata: Extra metadata to attach to indexed chunks.

        Returns:
            IngestResult; failures are reported via ``success=False`` and
            ``error`` rather than raised.
        """
        if not source_type:
            source_type = detect_source_type(source)

        # No-op callback keeps the rest of the method unconditional
        progress = on_progress or (lambda p, m: None)
        progress(0, f"Starting {source_type} ingest...")

        processors = {
            "youtube": self._process_youtube,
            "pdf": self._process_pdf,
            "audio": self._process_audio,
            "web": self._process_web,
            "markdown": self._process_markdown,
        }

        processor = processors.get(source_type)
        if not processor:
            return IngestResult(source=source, source_type=source_type, error=f"Unsupported type: {source_type}", success=False)

        # Broad catch is deliberate: any processor failure (missing dependency,
        # network error, bad file) becomes a failed IngestResult, not a crash.
        try:
            text, title = processor(source, progress)
        except Exception as e:
            return IngestResult(source=source, source_type=source_type, error=str(e), success=False)

        if not text or len(text.strip()) < 50:
            return IngestResult(source=source, source_type=source_type, error="Extracted text too short", success=False)

        # Chunk and index
        progress(75, "Chunking content...")
        chunks = chunk_markdown(text, max_tokens=512, source=source)

        progress(85, f"Indexing {len(chunks)} chunks...")
        texts = [c.text for c in chunks]
        headings = [c.heading for c in chunks]
        # NOTE(review): unlike indexer.index_directory, no file_hash= is passed
        # here, so re-ingesting the same source is keyed on the source string
        # alone — verify dedup expectations against VectorStore.index_chunks.
        count = self._store.index_chunks(
            texts=texts,
            headings=headings,
            source=source,
            metadata={"type": source_type, "title": title, **(metadata or {})},
        )

        progress(100, f"Done — {count} chunks indexed")

        return IngestResult(
            source=source,
            source_type=source_type,
            text_length=len(text),
            chunks_created=count,
            title=title,
            success=True,
        )

    def _process_youtube(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
        """Download YouTube video and transcribe audio.

        Returns (transcript, video title). Requires yt-dlp plus a local
        FFmpeg for the audio-extraction postprocessor.
        """
        try:
            import yt_dlp
        except ImportError:
            raise RuntimeError("yt-dlp not installed. Run: pip install yt-dlp")

        progress(5, "Fetching video info...")

        # Download audio only
        # NOTE(review): fixed output name — concurrent YouTube ingests would
        # clobber each other's audio file; assumes the FFmpeg postprocessor
        # writes yt_audio.wav matching the template below. TODO confirm.
        audio_path = str(self._media_dir / "yt_audio.wav")
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": str(self._media_dir / "yt_audio.%(ext)s"),
            "postprocessors": [{
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "16",
            }],
            "quiet": True,
            "no_warnings": True,
        }

        progress(10, "Downloading audio...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            title = info.get("title", "YouTube Video")

        progress(35, "Transcribing audio...")
        text = self._transcribe_audio(audio_path)

        # Cleanup — best-effort removal of the temporary audio file
        try:
            os.remove(audio_path)
        except OSError:
            pass

        return text, title

    def _process_pdf(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Extract text from PDF.

        Returns (page texts joined by blank lines, title derived from the
        file name). Pages with no extractable text contribute empty strings.
        """
        try:
            import pdfplumber
        except ImportError:
            raise RuntimeError("pdfplumber not installed. Run: pip install pdfplumber")

        progress(10, "Opening PDF...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"PDF not found: {path}")

        pages_text = []
        with pdfplumber.open(filepath) as pdf:
            total_pages = len(pdf.pages)
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                pages_text.append(text)
                # Page extraction spans the 10%–70% progress window
                pct = 10 + int((i / total_pages) * 60)
                progress(pct, f"Extracting page {i + 1}/{total_pages}...")

        title = filepath.stem.replace("-", " ").replace("_", " ")
        return "\n\n".join(pages_text), title

    def _process_audio(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Transcribe audio file.

        Returns (transcript, title derived from the file name).
        """
        progress(10, "Loading audio...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"Audio not found: {path}")

        progress(20, "Transcribing audio...")
        text = self._transcribe_audio(str(filepath))
        title = filepath.stem.replace("-", " ").replace("_", " ")
        return text, title

    def _process_web(self, url: str, progress: ProgressCallback) -> tuple[str, str]:
        """Scrape web page content.

        Returns (main text with chrome stripped, page title or the URL).
        """
        try:
            import requests
            from bs4 import BeautifulSoup
        except ImportError:
            raise RuntimeError("beautifulsoup4 and requests not installed. Run: pip install beautifulsoup4 requests")

        progress(10, "Fetching page...")
        resp = requests.get(url, timeout=15, headers={
            "User-Agent": "Mozilla/5.0 (ArkaOS Knowledge Indexer)"
        })
        resp.raise_for_status()

        progress(40, "Parsing content...")
        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove scripts, styles, nav, footer
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()

        # Get title
        # NOTE(review): soup.title.string can be None for an empty <title>,
        # which would propagate None as the chunk metadata title — confirm.
        title = soup.title.string if soup.title else url

        # Get main content (article > main > body)
        main = soup.find("article") or soup.find("main") or soup.find("body")
        text = main.get_text(separator="\n\n", strip=True) if main else soup.get_text(separator="\n\n", strip=True)

        # Clean up whitespace — collapse runs of 3+ newlines to one blank line
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text, title

    def _process_markdown(self, path: str, progress: ProgressCallback) -> tuple[str, str]:
        """Read markdown/text file directly.

        Returns (file contents, title derived from the file name).
        """
        progress(10, "Reading file...")
        filepath = Path(path)
        if not filepath.exists():
            raise FileNotFoundError(f"File not found: {path}")

        text = filepath.read_text(encoding="utf-8")
        title = filepath.stem.replace("-", " ").replace("_", " ")
        return text, title

    def _transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using faster-whisper (or fallback).

        Tries faster-whisper first (lighter), then openai-whisper; raises
        RuntimeError with install instructions when neither is available.
        """
        try:
            from faster_whisper import WhisperModel
            model = WhisperModel("base", device="cpu", compute_type="int8")
            segments, _ = model.transcribe(audio_path, beam_size=5)
            return " ".join(segment.text for segment in segments)
        except ImportError:
            pass

        try:
            import whisper
            model = whisper.load_model("base")
            result = model.transcribe(audio_path)
            return result["text"]
        except ImportError:
            raise RuntimeError(
                "No transcription engine available. Install one:\n"
                "  pip install faster-whisper (recommended, lighter)\n"
                "  pip install openai-whisper (original, heavier)"
            )