brainlayer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brainlayer/__init__.py +3 -0
- brainlayer/cli/__init__.py +1545 -0
- brainlayer/cli/wizard.py +132 -0
- brainlayer/cli_new.py +151 -0
- brainlayer/client.py +164 -0
- brainlayer/clustering.py +736 -0
- brainlayer/daemon.py +1105 -0
- brainlayer/dashboard/README.md +129 -0
- brainlayer/dashboard/__init__.py +5 -0
- brainlayer/dashboard/app.py +151 -0
- brainlayer/dashboard/search.py +229 -0
- brainlayer/dashboard/views.py +230 -0
- brainlayer/embeddings.py +131 -0
- brainlayer/engine.py +550 -0
- brainlayer/index_new.py +87 -0
- brainlayer/mcp/__init__.py +1558 -0
- brainlayer/migrate.py +205 -0
- brainlayer/paths.py +43 -0
- brainlayer/pipeline/__init__.py +47 -0
- brainlayer/pipeline/analyze_communication.py +508 -0
- brainlayer/pipeline/brain_graph.py +567 -0
- brainlayer/pipeline/chat_tags.py +63 -0
- brainlayer/pipeline/chunk.py +422 -0
- brainlayer/pipeline/classify.py +472 -0
- brainlayer/pipeline/cluster_sampling.py +73 -0
- brainlayer/pipeline/enrichment.py +810 -0
- brainlayer/pipeline/extract.py +66 -0
- brainlayer/pipeline/extract_claude_desktop.py +149 -0
- brainlayer/pipeline/extract_corrections.py +231 -0
- brainlayer/pipeline/extract_markdown.py +195 -0
- brainlayer/pipeline/extract_whatsapp.py +227 -0
- brainlayer/pipeline/git_overlay.py +301 -0
- brainlayer/pipeline/longitudinal_analyzer.py +568 -0
- brainlayer/pipeline/obsidian_export.py +455 -0
- brainlayer/pipeline/operation_grouping.py +486 -0
- brainlayer/pipeline/plan_linking.py +313 -0
- brainlayer/pipeline/sanitize.py +549 -0
- brainlayer/pipeline/semantic_style.py +574 -0
- brainlayer/pipeline/session_enrichment.py +472 -0
- brainlayer/pipeline/style_embed.py +67 -0
- brainlayer/pipeline/style_index.py +139 -0
- brainlayer/pipeline/temporal_chains.py +203 -0
- brainlayer/pipeline/time_batcher.py +248 -0
- brainlayer/pipeline/unified_timeline.py +569 -0
- brainlayer/storage.py +66 -0
- brainlayer/store.py +155 -0
- brainlayer/taxonomy.json +80 -0
- brainlayer/vector_store.py +1891 -0
- brainlayer-1.0.0.dist-info/METADATA +313 -0
- brainlayer-1.0.0.dist-info/RECORD +53 -0
- brainlayer-1.0.0.dist-info/WHEEL +4 -0
- brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
- brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Stage 1: Extract system prompts and detect conversation continuations."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Iterator
|
|
6
|
+
|
|
7
|
+
import orjson
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def hash_content(content: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``content``.

    The text is encoded with ``str.encode()``'s default codec (UTF-8) before
    hashing. The digest is used as a content-addressable storage key.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode())
    return hasher.hexdigest()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def parse_jsonl(file_path: Path) -> Iterator[dict]:
    """Parse a JSONL file, yielding each JSON-object line as a dict.

    Blank lines and lines that are not valid JSON are skipped. Lines whose
    top-level JSON value is not an object (arrays, strings, numbers, ``null``)
    are skipped as well, so the ``Iterator[dict]`` contract holds and callers
    can safely call ``.get()`` on every yielded entry.

    Args:
        file_path: Path to the ``.jsonl`` file. It is opened in binary mode and
            each stripped line is passed to ``orjson.loads`` as bytes.

    Yields:
        One dict per well-formed JSON object line, in file order.
    """
    with open(file_path, "rb") as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            # Keep the try body minimal: only the decode can raise here.
            try:
                entry = orjson.loads(raw_line)
            except orjson.JSONDecodeError:
                continue  # Skip malformed lines
            if isinstance(entry, dict):
                yield entry
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def extract_system_prompts(conversations_dir: Path) -> dict[str, str]:
    """
    Extract and deduplicate system prompts from all conversations.

    Every ``*.jsonl`` file under ``conversations_dir`` (searched recursively)
    is scanned. An entry is treated as a system prompt when its ``type`` is
    ``"user"``, its ``message.content`` is a string longer than 2000
    characters, and that string contains ``"CLAUDE.md"`` or (case-insensitively)
    ``"system"``.

    Entries whose ``message`` is missing/``null``/not a dict, or whose
    ``content`` is not a plain string (for example a list of content blocks),
    are ignored instead of raising ``AttributeError``.

    Args:
        conversations_dir: Directory containing conversation JSONL files.

    Returns: {hash: prompt_content} mapping for content-addressable storage.
    """
    prompts: dict[str, str] = {}

    for jsonl_file in conversations_dir.rglob("*.jsonl"):
        for entry in parse_jsonl(jsonl_file):
            # First user message often contains system prompt
            if not isinstance(entry, dict) or entry.get("type") != "user":
                continue

            message = entry.get("message")
            if not isinstance(message, dict):
                continue

            content = message.get("content")
            if not isinstance(content, str):
                continue

            # Heuristic: system prompts are typically >2000 chars
            # and contain CLAUDE.md or system instructions
            if len(content) > 2000 and ("CLAUDE.md" in content or "system" in content.lower()):
                # setdefault keeps the first occurrence for a given hash.
                prompts.setdefault(hash_content(content), content)

    return prompts
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def detect_continuations(conversations_dir: Path) -> list[list[Path]]:
    """
    Group conversation JSONL files into continuation chains.

    Planned signals (not implemented yet):
    - Session ID matching
    - Temporal proximity (within 30 min)
    - Same project directory

    Currently a placeholder: every ``*.jsonl`` file found recursively under
    ``conversations_dir`` is returned as its own single-element chain.

    Returns: List of continuation chains (each chain is a list of file paths).
    """
    # TODO: Implement continuation detection
    # For now, treat each file as independent
    chains: list[list[Path]] = []
    for transcript in conversations_dir.rglob("*.jsonl"):
        chains.append([transcript])
    return chains
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Extract Claude chat transcripts from desktop/web app IndexedDB.
|
|
2
|
+
|
|
3
|
+
Claude desktop app stores conversations in LevelDB (IndexedDB backend).
|
|
4
|
+
This module extracts chat data for analysis.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Iterator
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import plyvel
|
|
13
|
+
|
|
14
|
+
HAS_PLYVEL = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
HAS_PLYVEL = False
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_claude_indexeddb_path() -> Path:
    """Return the location of the Claude desktop app's IndexedDB LevelDB directory.

    The path is built under the current user's home directory using the
    ``Library/Application Support`` layout; existence is not checked here.
    """
    relative_location = "Library/Application Support/Claude/IndexedDB/https_claude.ai_0.indexeddb.leveldb"
    return Path.home().joinpath(relative_location)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def extract_with_plyvel(db_path: Path) -> Iterator[dict]:
    """Yield JSON values stored in a LevelDB database, read via plyvel.

    Each stored value is decoded as UTF-8 (undecodable bytes dropped). Values
    whose decoded text starts with ``{`` or ``[`` are parsed as JSON and
    yielded; everything else, including text that fails to parse, is skipped.
    The database handle is closed when iteration finishes or is abandoned.

    Args:
        db_path: Directory of the LevelDB database (opened without creating it).

    Raises:
        ImportError: If plyvel is not installed (requires: pip install plyvel).
            Because this is a generator, it is raised on first iteration.
    """
    if not HAS_PLYVEL:
        raise ImportError("plyvel not installed. Run: pip install plyvel")

    leveldb = plyvel.DB(str(db_path), create_if_missing=False)

    try:
        for _key, raw_value in leveldb:
            # IndexedDB stores data with metadata prefixes, so only values
            # that look like JSON after decoding are considered.
            try:
                text = raw_value.decode("utf-8", errors="ignore")
                if not text.startswith(("{", "[")):
                    continue
                yield json.loads(text)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
    finally:
        leveldb.close()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def extract_with_chrome_devtools() -> Iterator[dict]:
    """
    Placeholder for extraction over the Chrome DevTools Protocol.

    The intended approach is to connect to a running Claude desktop app and
    pull the data out via JavaScript, as an alternative to parsing LevelDB
    directly.

    Raises:
        NotImplementedError: Always; raised at call time because the function
            body contains no ``yield``.
    """
    # TODO: Implement CDP extraction
    # This would connect to the running app and extract via JavaScript
    not_ready = NotImplementedError("Chrome DevTools extraction not yet implemented")
    raise not_ready
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def extract_conversations_from_export(export_path: Path) -> Iterator[dict]:
    """
    Extract conversations from a Claude chat export file (if available).

    Two layouts are accepted: a top-level JSON list of conversations, or a
    JSON object with a ``"conversations"`` key holding that list. Any other
    layout yields nothing.

    The file is read explicitly as UTF-8 (the encoding JSON interchange uses)
    rather than the platform's locale encoding, so non-ASCII text survives on
    systems whose default encoding is not UTF-8.

    Args:
        export_path: Path to the JSON export file.

    Yields:
        Each conversation entry exactly as stored in the export.

    Raises:
        FileNotFoundError: If ``export_path`` does not exist. Because this is
            a generator, the error surfaces on first iteration.
    """
    if not export_path.exists():
        raise FileNotFoundError(f"Export file not found: {export_path}")

    with open(export_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Format depends on Claude's export structure
    # Assuming it's a list of conversations
    if isinstance(data, list):
        yield from data
    elif isinstance(data, dict) and "conversations" in data:
        yield from data["conversations"]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def extract_claude_chats(method: str = "manual") -> Iterator[dict]:
    """
    Yield Claude chat transcripts using the chosen extraction method.

    Args:
        method: Extraction method
            - "manual": Print export instructions, then read the export file
              from ~/.local/share/brainlayer/claude-export.json if it exists
            - "plyvel": Direct LevelDB access (requires plyvel)
            - "cdp": Chrome DevTools Protocol (requires running app)

    Yields:
        Chat conversation dictionaries

    Raises:
        FileNotFoundError: With "plyvel", when the IndexedDB directory is absent.
        ValueError: When ``method`` is not one of the values above.
    """
    if method == "plyvel":
        leveldb_dir = get_claude_indexeddb_path()
        if not leveldb_dir.exists():
            raise FileNotFoundError(f"Claude IndexedDB not found at: {leveldb_dir}")
        yield from extract_with_plyvel(leveldb_dir)
        return

    if method == "cdp":
        yield from extract_with_chrome_devtools()
        return

    if method != "manual":
        raise ValueError(f"Unknown extraction method: {method}")

    # Manual mode: guide the user through exporting, then read the file if present.
    instructions = (
        "\n" + "=" * 70,
        "MANUAL EXPORT INSTRUCTIONS",
        "=" * 70,
        "\n1. Open Claude.ai in your browser",
        "2. Go to Settings > Data & Privacy",
        "3. Click 'Export my data'",
        "4. Download the export file",
        "5. Save it to: ~/.local/share/brainlayer/claude-export.json",
        "\nAlternatively, copy conversations manually from the web interface.",
        "=" * 70 + "\n",
    )
    for text in instructions:
        print(text)

    export_file = Path.home() / ".local/share/brainlayer/claude-export.json"
    if not export_file.exists():
        print(f"Waiting for export at: {export_file}")
        return

    yield from extract_conversations_from_export(export_file)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def format_claude_chat_for_pipeline(conversation: dict) -> dict:
    """
    Reshape a Claude chat conversation into the brainlayer pipeline record.

    The mapping is a placeholder until the real Claude.ai export schema is
    known. Missing top-level fields become ``None``; a message without a role
    defaults to ``"user"`` and one without content to ``""``.

    Args:
        conversation: Dict that may carry ``id``, ``created_at``, ``model``
            and a ``messages`` list of dicts.

    Returns:
        Dict with ``type``, ``id``, ``created_at``, ``messages`` and
        ``metadata`` (``source`` fixed to ``"claude_desktop"`` plus ``model``).
    """
    # This depends on the actual format from Claude.ai
    formatted_messages = []
    for raw in conversation.get("messages", []):
        formatted_messages.append(
            {
                "role": raw.get("role", "user"),
                "content": raw.get("content", ""),
                "timestamp": raw.get("timestamp"),
            }
        )

    metadata = {
        "source": "claude_desktop",
        "model": conversation.get("model"),
    }

    record = {
        "type": "conversation",
        "id": conversation.get("id"),
        "created_at": conversation.get("created_at"),
        "messages": formatted_messages,
        "metadata": metadata,
    }
    return record
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Extract correction patterns from Claude/Gemini conversations.
|
|
2
|
+
|
|
3
|
+
Identifies where AI assistants helped draft text and user made corrections,
|
|
4
|
+
to learn what the user consistently changes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_claude_export_conversations(export_path: Path) -> Iterator[dict]:
    """
    Extract conversations from Claude.ai export.

    The export is read explicitly as UTF-8 rather than the platform's locale
    encoding, so non-ASCII message text is decoded correctly everywhere.

    Args:
        export_path: Path to extracted conversations.json; its top-level JSON
            value is iterated as a sequence of conversation objects.

    Yields:
        One dict per conversation with keys ``id`` (from ``uuid``), ``name``
        (default ``"Untitled"``), ``created_at`` and ``messages``. Each
        message has ``role`` (``"user"`` when ``sender`` is ``"human"``,
        otherwise ``"assistant"``), ``content`` (from ``text``, default ``""``)
        and ``created_at``.
    """
    with open(export_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for conv in data:
        yield {
            "id": conv.get("uuid"),
            "name": conv.get("name", "Untitled"),
            "created_at": conv.get("created_at"),
            "messages": [
                {
                    "role": "user" if msg.get("sender") == "human" else "assistant",
                    "content": msg.get("text", ""),
                    "created_at": msg.get("created_at"),
                }
                for msg in conv.get("chat_messages", [])
            ],
        }
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Phrases that signal a request for drafting help or feedback on a draft.
# The leading \b stops a phrase from matching inside a longer word (e.g. the
# "write a" in "overwrite a file"); "(?:re)?write" keeps "rewrite a ..." matching.
_DRAFT_REQUEST_RE = re.compile(
    r"\b(?:"
    r"(?:re)?write (?:me )?a"
    r"|draft (?:me )?a"
    r"|help me write"
    r"|can you write"
    r"|make (?:me )?a"
    r"|does this sound"
    r"|is this (?:good|right|ok)"
    r"|how does this sound"
    r"|rewrite this"
    r"|improve this"
    r"|fix this"
    r"|better way to say"
    r")"
)


def is_draft_request(text: str) -> bool:
    """Detect if user is asking for help drafting text.

    Matching is case-insensitive (the text is lower-cased first) and looks
    for any of a fixed set of request phrases, each of which must begin at a
    word boundary.

    Args:
        text: The user's message.

    Returns:
        True if any drafting/feedback phrase occurs in ``text``.
    """
    return _DRAFT_REQUEST_RE.search(text.lower()) is not None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Feedback phrases that mark a message as a correction. Anchored with a
# leading \b so "no," does not fire inside words such as "piano,".
_CORRECTION_MARKER_RE = re.compile(
    r"\b(?:"
    r"no,|actually,|instead|change it|make it|too formal|too long|too short"
    r"|simpler|more casual|more professional|shorter|just say"
    r"|better:|fix:|use this:"
    r")"
)

# Evidence that the user supplied replacement text: a non-empty quoted span
# or a colon followed by whitespace and text. A lone apostrophe ("that's")
# or a bare/trailing colon is not enough.
_ALTERNATIVE_TEXT_RE = re.compile(
    r'"[^"]+"'
    r"|(?<!\w)'[^'\n]+'(?!\w)"
    r"|:\s+\S"
)


def is_correction(user_msg: str, prev_assistant_msg: str) -> bool:
    """
    Detect if user message is correcting the assistant's draft.

    Patterns:
    - "No, make it..."
    - "Actually, change it to..."
    - "Too formal, use..."
    - User provides alternative text (quoted, or after a colon)

    Args:
        user_msg: The user's follow-up message.
        prev_assistant_msg: The assistant draft being responded to. Currently
            not consulted; kept so the signature stays stable for callers.

    Returns:
        True if the message contains a correction marker or appears to
        supply alternative text.
    """
    # Check for correction markers (case-insensitive)
    if _CORRECTION_MARKER_RE.search(user_msg.lower()):
        return True

    # Check if user provides alternative text (quoted or after colon)
    return _ALTERNATIVE_TEXT_RE.search(user_msg) is not None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def extract_correction_pairs(conversation: dict) -> list[dict]:
    """
    Collect (draft, correction) exchanges from one conversation.

    An exchange is three consecutive messages: a user draft request, the
    assistant's reply, and a user message that ``is_correction`` accepts.

    Args:
        conversation: Dict with a ``messages`` list (each message has
            ``role`` and ``content``) and a ``name``.

    Returns:
        List of correction dictionaries with:
        - context: The user's original draft request
        - ai_draft: AI's draft
        - user_correction: User's correction/feedback
        - conversation_name: Name of the source conversation
    """
    found = []
    messages = conversation["messages"]
    total = len(messages)

    # The last message can never start an exchange, so it is left out.
    for idx, request in enumerate(messages[:-1]):
        # Step 1: the user asks for a draft.
        if request["role"] != "user" or not is_draft_request(request["content"]):
            continue

        # Step 2: the assistant answers directly afterwards.
        reply = messages[idx + 1]
        if not reply or reply["role"] != "assistant":
            continue
        ai_draft = reply["content"]

        # Step 3: the user reacts with a correction.
        if idx + 2 >= total:
            continue
        follow_up = messages[idx + 2]
        if follow_up["role"] != "user":
            continue
        if not is_correction(follow_up["content"], ai_draft):
            continue

        found.append(
            {
                "context": request["content"],
                "ai_draft": ai_draft,
                "user_correction": follow_up["content"],
                "conversation_name": conversation["name"],
            }
        )

    return found
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def analyze_correction_patterns(corrections: list[dict]) -> dict:
    """
    Analyze what user consistently changes in AI drafts.

    Only each correction's ``user_correction`` text is inspected, matched
    case-insensitively against keyword lists (the emoji check uses the
    original text). The AI draft is not read, so corrections without an
    ``ai_draft`` key are accepted.

    Args:
        corrections: Correction dicts, each with a ``user_correction`` string.

    Returns:
        ``{}`` for empty input; otherwise a dictionary with:
        - total_corrections: Number of corrections analyzed
        - makes_shorter / makes_longer: Length-change requests
        - removes_formality: Requests for a less formal tone
        - adds_emojis: Corrections containing one of the tracked emojis
        - simplifies_language: Requests for simpler wording
        - common_feedback: Every matched feedback phrase, one entry per
          occurrence (repeats across corrections are kept)
    """
    if not corrections:
        return {}

    # Keyword tables are loop-invariant, so they are built once up front.
    shorter_words = ("shorter", "too long", "brief")
    longer_words = ("longer", "more detail")
    informal_words = ("casual", "informal", "too formal", "stiff")
    emojis = ("😂", "😊", "🔥", "💪", "👍", "❤️")
    simpler_words = ("simpler", "simple", "easier", "plain")
    feedback_phrases = (
        "too formal",
        "too long",
        "too short",
        "more casual",
        "less formal",
        "simpler",
        "add emoji",
        "remove",
    )

    patterns = {
        "total_corrections": len(corrections),
        "makes_shorter": 0,
        "makes_longer": 0,
        "removes_formality": 0,
        "adds_emojis": 0,
        "simplifies_language": 0,
        "common_feedback": [],
    }

    for corr in corrections:
        original_text = corr["user_correction"]
        feedback = original_text.lower()

        # Length changes
        if any(word in feedback for word in shorter_words):
            patterns["makes_shorter"] += 1
        if any(word in feedback for word in longer_words):
            patterns["makes_longer"] += 1

        # Formality
        if any(word in feedback for word in informal_words):
            patterns["removes_formality"] += 1

        # Emojis (checked on the original text; lower-casing is irrelevant here)
        if any(emoji in original_text for emoji in emojis):
            patterns["adds_emojis"] += 1

        # Simplification
        if any(word in feedback for word in simpler_words):
            patterns["simplifies_language"] += 1

        # Extract common feedback phrases
        patterns["common_feedback"].extend(phrase for phrase in feedback_phrases if phrase in feedback)

    return patterns
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def extract_user_final_versions(corrections: list[dict]) -> list[str]:
    """
    Extract the final versions user actually used/approved.

    These are the gold standard examples of their style.

    Two sources are harvested from each correction's ``user_correction``:
    every non-empty double-quoted span, and the text following the first
    colon. An empty remainder after a colon (e.g. a message that ends with
    ``:``) is skipped rather than recorded as an empty example. Text that is
    both quoted and after a colon is reported by both rules.

    Args:
        corrections: Correction dicts, each with a ``user_correction`` string.

    Returns:
        Extracted candidate texts, in input order.
    """
    final_versions = []

    for corr in corrections:
        user_text = corr["user_correction"]

        # Look for quoted text
        final_versions.extend(re.findall(r'"([^"]+)"', user_text))

        # Look for text after colon
        _, colon, remainder = user_text.partition(":")
        remainder = remainder.strip()
        if colon and remainder:
            final_versions.append(remainder)

    return final_versions
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Extract and classify markdown files for indexing."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Iterator
|
|
6
|
+
|
|
7
|
+
from .classify import ClassifiedContent, ContentType, ContentValue
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def find_markdown_files(
|
|
11
|
+
root: Path, patterns: list[str] | None = None, exclude: list[str] | None = None
|
|
12
|
+
) -> Iterator[Path]:
|
|
13
|
+
"""
|
|
14
|
+
Find markdown files matching glob patterns.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
root: Root directory to search
|
|
18
|
+
patterns: Glob patterns to match (default: ["**/*.md"])
|
|
19
|
+
exclude: Directory names to exclude (default: common non-content dirs)
|
|
20
|
+
|
|
21
|
+
Yields:
|
|
22
|
+
Path objects for matching markdown files
|
|
23
|
+
"""
|
|
24
|
+
if patterns is None:
|
|
25
|
+
patterns = ["**/*.md"]
|
|
26
|
+
if exclude is None:
|
|
27
|
+
exclude = ["node_modules", ".git", "dist", "__pycache__", ".venv", "venv"]
|
|
28
|
+
|
|
29
|
+
exclude_set = set(exclude)
|
|
30
|
+
|
|
31
|
+
for pattern in patterns:
|
|
32
|
+
for path in root.glob(pattern):
|
|
33
|
+
# Skip excluded directories
|
|
34
|
+
if any(part in exclude_set for part in path.parts):
|
|
35
|
+
continue
|
|
36
|
+
if path.is_file():
|
|
37
|
+
yield path
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Fenced code blocks (``` or ~~~) and inline `code` spans. They are masked
# before tag scanning so "#word" inside code is not mistaken for a tag.
_MD_CODE_SPAN_RE = re.compile(r"```.*?```|~~~.*?~~~|`[^`\n]+`", re.DOTALL)
# "#tag" preceded by whitespace or start of text.
_MD_TAG_RE = re.compile(r"(?<!\S)#([a-zA-Z][a-zA-Z0-9_-]*)")
# Words that look like tags but are common code directives.
_MD_NON_TAG_WORDS = frozenset({"include", "define", "ifdef", "endif", "pragma", "import"})


def _extract_markdown_tags(text: str) -> list[str]:
    """Return the sorted, lower-cased ``#tag`` names found outside code spans."""
    # Each code span collapses to one non-space placeholder so that text
    # directly adjacent to the span does not start looking like a tag.
    prose_only = _MD_CODE_SPAN_RE.sub("`", text)
    found = {match.group(1).lower() for match in _MD_TAG_RE.finditer(prose_only)}
    return sorted(found - _MD_NON_TAG_WORDS)


def parse_markdown(file_path: Path) -> list[dict]:
    """
    Parse a markdown file into sections.

    Splits on h2 headers (## ) to create logical chunks.
    Each section includes header hierarchy for context.

    Behavior details:
    - A leading ``---``-delimited block is parsed as flat ``key: value``
      frontmatter (surrounding quotes stripped) and removed from the content.
    - Tags are ``#word`` tokens collected from the whole file, ignoring fenced
      and inline code; every section carries the same sorted tag list.
    - The first ``# `` heading is the document title; the file stem is used
      when there is none.
    - Without any h2 header the whole file is one section. Otherwise text
      before the first h2 becomes an intro section only if it is longer than
      50 characters, followed by one section per h2 (header line included).

    Args:
        file_path: Markdown file to read (UTF-8, undecodable bytes replaced).

    Returns:
        List of dicts with keys: content, header, parent_headers, tags,
        frontmatter
    """
    content = file_path.read_text(encoding="utf-8", errors="replace")

    # Extract frontmatter if present
    frontmatter: dict[str, str] = {}
    if content.startswith("---"):
        parts = content.split("---", 2)
        if len(parts) >= 3:
            # Simple YAML parsing for common keys
            for line in parts[1].strip().split("\n"):
                if ":" in line:
                    key, _, value = line.partition(":")
                    frontmatter[key.strip()] = value.strip().strip("\"'")
            content = parts[2]

    tags = _extract_markdown_tags(content)

    h1_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
    h1_title = h1_match.group(1) if h1_match else file_path.stem

    # Find all h2 sections
    h2_positions = [(m.start(), m.group(1)) for m in re.finditer(r"^##\s+(.+)$", content, re.MULTILINE)]

    def make_section(body: str, header: str, parents: list[str]) -> dict:
        # Tags and frontmatter are copied so sections do not share mutable state.
        return {
            "content": body,
            "header": header,
            "parent_headers": parents,
            "tags": list(tags),
            "frontmatter": dict(frontmatter),
        }

    if not h2_positions:
        # No h2 headers - treat entire file as one section
        return [make_section(content.strip(), h1_title, [])]

    sections = []

    # Content before first h2
    first_h2_start = h2_positions[0][0]
    if first_h2_start > 0:
        intro = content[:first_h2_start].strip()
        if intro and len(intro) > 50:  # Skip trivial intros
            sections.append(make_section(intro, h1_title, []))

    # Each h2 section runs up to the next h2 (or the end of the file)
    for i, (pos, header) in enumerate(h2_positions):
        end_pos = h2_positions[i + 1][0] if i + 1 < len(h2_positions) else len(content)
        sections.append(make_section(content[pos:end_pos].strip(), header, [h1_title]))

    return sections
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def classify_by_path(file_path: Path) -> tuple[ContentType, ContentValue]:
    """
    Classify a markdown file based on its path.

    The path is lower-cased and compared in POSIX form (forward slashes), so
    the directory rules also apply to Windows-style paths. Rules are checked
    in order and the first match wins:

    1. File named ``claude.md``          -> PROJECT_CONFIG, HIGH
    2. Path contains ``learnings/``      -> LEARNING, HIGH
    3. Path contains ``skills/``         -> SKILL, HIGH
    4. Path contains ``research/``       -> RESEARCH, HIGH
    5. Path contains ``/prd``, ``prd-`` or ``prd_`` -> PRD_ARCHIVE, MEDIUM
    6. Path contains ``/verification`` or ``verification-`` -> VERIFICATION, LOW
    7. Anything else                     -> DOCUMENTATION, MEDIUM

    Args:
        file_path: Path of the markdown file; only the path text is inspected.

    Returns:
        Tuple of (ContentType, ContentValue) based on path patterns
    """
    # as_posix() normalizes separators; on POSIX it equals str(file_path).
    path_str = file_path.as_posix().lower()
    name = file_path.name.lower()

    # CLAUDE.md files are high-value project config
    if name == "claude.md":
        return ContentType.PROJECT_CONFIG, ContentValue.HIGH

    # Learnings are curated high-value content
    # ("x/" already covers "/x/", so a single substring test suffices)
    if "learnings/" in path_str:
        return ContentType.LEARNING, ContentValue.HIGH

    # Skills are high-value reference
    if "skills/" in path_str:
        return ContentType.SKILL, ContentValue.HIGH

    # Research docs
    if "research/" in path_str:
        return ContentType.RESEARCH, ContentValue.HIGH

    # PRD archives - medium value
    if any(marker in path_str for marker in ("/prd", "prd-", "prd_")):
        return ContentType.PRD_ARCHIVE, ContentValue.MEDIUM

    # Verification rounds - low value (voluminous)
    if any(marker in path_str for marker in ("/verification", "verification-")):
        return ContentType.VERIFICATION, ContentValue.LOW

    # Default: documentation
    return ContentType.DOCUMENTATION, ContentValue.MEDIUM
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def extract_markdown_content(file_path: Path) -> list[ClassifiedContent]:
    """
    Turn one markdown file into classified, ready-to-chunk sections.

    The file is classified once from its path, then parsed into sections;
    every section receives the same content type and value.

    Args:
        file_path: Markdown file to process.

    Returns:
        List of ClassifiedContent objects for each section. Metadata carries
        the source file, the section header, the header hierarchy joined with
        " > ", the tags and the frontmatter.
    """
    content_type, value = classify_by_path(file_path)
    source_file = str(file_path)

    return [
        ClassifiedContent(
            content=section["content"],
            content_type=content_type,
            value=value,
            metadata={
                "source_file": source_file,
                "header": section["header"],
                # Header hierarchy, parents first, e.g. "Title > Section"
                "header_context": " > ".join([*section["parent_headers"], section["header"]]),
                "tags": section["tags"],
                "frontmatter": section.get("frontmatter", {}),
            },
        )
        for section in parse_markdown(file_path)
    ]
|