brainlayer 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. brainlayer/__init__.py +3 -0
  2. brainlayer/cli/__init__.py +1545 -0
  3. brainlayer/cli/wizard.py +132 -0
  4. brainlayer/cli_new.py +151 -0
  5. brainlayer/client.py +164 -0
  6. brainlayer/clustering.py +736 -0
  7. brainlayer/daemon.py +1105 -0
  8. brainlayer/dashboard/README.md +129 -0
  9. brainlayer/dashboard/__init__.py +5 -0
  10. brainlayer/dashboard/app.py +151 -0
  11. brainlayer/dashboard/search.py +229 -0
  12. brainlayer/dashboard/views.py +230 -0
  13. brainlayer/embeddings.py +131 -0
  14. brainlayer/engine.py +550 -0
  15. brainlayer/index_new.py +87 -0
  16. brainlayer/mcp/__init__.py +1558 -0
  17. brainlayer/migrate.py +205 -0
  18. brainlayer/paths.py +43 -0
  19. brainlayer/pipeline/__init__.py +47 -0
  20. brainlayer/pipeline/analyze_communication.py +508 -0
  21. brainlayer/pipeline/brain_graph.py +567 -0
  22. brainlayer/pipeline/chat_tags.py +63 -0
  23. brainlayer/pipeline/chunk.py +422 -0
  24. brainlayer/pipeline/classify.py +472 -0
  25. brainlayer/pipeline/cluster_sampling.py +73 -0
  26. brainlayer/pipeline/enrichment.py +810 -0
  27. brainlayer/pipeline/extract.py +66 -0
  28. brainlayer/pipeline/extract_claude_desktop.py +149 -0
  29. brainlayer/pipeline/extract_corrections.py +231 -0
  30. brainlayer/pipeline/extract_markdown.py +195 -0
  31. brainlayer/pipeline/extract_whatsapp.py +227 -0
  32. brainlayer/pipeline/git_overlay.py +301 -0
  33. brainlayer/pipeline/longitudinal_analyzer.py +568 -0
  34. brainlayer/pipeline/obsidian_export.py +455 -0
  35. brainlayer/pipeline/operation_grouping.py +486 -0
  36. brainlayer/pipeline/plan_linking.py +313 -0
  37. brainlayer/pipeline/sanitize.py +549 -0
  38. brainlayer/pipeline/semantic_style.py +574 -0
  39. brainlayer/pipeline/session_enrichment.py +472 -0
  40. brainlayer/pipeline/style_embed.py +67 -0
  41. brainlayer/pipeline/style_index.py +139 -0
  42. brainlayer/pipeline/temporal_chains.py +203 -0
  43. brainlayer/pipeline/time_batcher.py +248 -0
  44. brainlayer/pipeline/unified_timeline.py +569 -0
  45. brainlayer/storage.py +66 -0
  46. brainlayer/store.py +155 -0
  47. brainlayer/taxonomy.json +80 -0
  48. brainlayer/vector_store.py +1891 -0
  49. brainlayer-1.0.0.dist-info/METADATA +313 -0
  50. brainlayer-1.0.0.dist-info/RECORD +53 -0
  51. brainlayer-1.0.0.dist-info/WHEEL +4 -0
  52. brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
  53. brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,66 @@
1
+ """Stage 1: Extract system prompts and detect conversation continuations."""
2
+
3
+ import hashlib
4
+ from pathlib import Path
5
+ from typing import Iterator
6
+
7
+ import orjson
8
+
9
+
10
def hash_content(content: str) -> str:
    """Return the SHA-256 hex digest of ``content``.

    The text is encoded with ``str.encode()`` defaults before hashing; the
    resulting digest is used as the key for content-addressable storage.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode())
    return hasher.hexdigest()
13
+
14
+
15
def parse_jsonl(file_path: Path) -> Iterator[dict]:
    """Parse a JSONL file, yielding each JSON-object line as a dict.

    Blank lines and lines that are not valid JSON are skipped. Lines holding
    valid JSON of another type (array, string, number, ``null``) are skipped
    as well, so consumers can rely on the ``Iterator[dict]`` contract and call
    ``.get`` on every yielded item.

    Args:
        file_path: Path of the JSONL file; it is opened in binary mode.

    Yields:
        One dict per line that decodes to a JSON object.
    """
    with open(file_path, "rb") as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                record = orjson.loads(raw_line)
            except orjson.JSONDecodeError:
                continue  # Skip malformed lines
            # A syntactically valid line may still not be an object.
            if isinstance(record, dict):
                yield record
26
+
27
+
28
def extract_system_prompts(conversations_dir: Path) -> dict[str, str]:
    """
    Extract and deduplicate system prompts from all conversations.

    Every ``*.jsonl`` file under ``conversations_dir`` is scanned recursively.
    An entry of type "user" is treated as a system prompt when its message
    content is a string longer than 2000 characters that mentions
    ``CLAUDE.md`` or the word "system" (case-insensitive).

    Entries whose ``message`` is not a dict, or whose ``content`` is not a
    string, are skipped instead of raising on ``.get`` / ``.lower()``.

    Returns: {hash: prompt_content} mapping for content-addressable storage.
    """
    prompts: dict[str, str] = {}

    for jsonl_file in conversations_dir.rglob("*.jsonl"):
        for entry in parse_jsonl(jsonl_file):
            # Only user messages are candidates for carrying a system prompt.
            if entry.get("type") != "user":
                continue

            message = entry.get("message")
            if not isinstance(message, dict):
                continue  # "message" missing, null, or not an object

            content = message.get("content")
            if not isinstance(content, str):
                continue  # the heuristic below only makes sense for plain text

            # Heuristic: system prompts are typically >2000 chars
            # and contain CLAUDE.md or system instructions
            if len(content) > 2000 and ("CLAUDE.md" in content or "system" in content.lower()):
                # setdefault keeps the first occurrence for a given hash.
                prompts.setdefault(hash_content(content), content)

    return prompts
51
+
52
+
53
def detect_continuations(conversations_dir: Path) -> list[list[Path]]:
    """
    Group JSONL conversation files into continuation chains.

    The intended signals are:
    - Session ID matching
    - Temporal proximity (within 30 min)
    - Same project directory

    None of them is implemented yet: every ``*.jsonl`` file found recursively
    under ``conversations_dir`` is returned as its own single-file chain.

    Returns: List of continuation chains (each chain is a list of file paths).
    """
    # TODO: Implement continuation detection
    chains: list[list[Path]] = []
    for jsonl_file in conversations_dir.rglob("*.jsonl"):
        # For now, treat each file as independent
        chains.append([jsonl_file])
    return chains
@@ -0,0 +1,149 @@
1
+ """Extract Claude chat transcripts from desktop/web app IndexedDB.
2
+
3
+ Claude desktop app stores conversations in LevelDB (IndexedDB backend).
4
+ This module extracts chat data for analysis.
5
+ """
6
+
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Iterator
10
+
11
+ try:
12
+ import plyvel
13
+
14
+ HAS_PLYVEL = True
15
+ except ImportError:
16
+ HAS_PLYVEL = False
17
+
18
+
19
def get_claude_indexeddb_path() -> Path:
    """Return the expected location of the Claude desktop app's IndexedDB store.

    The path is built under the current user's home directory using the
    ``Library/Application Support`` layout; no other layout is handled here
    and the path is not checked for existence.
    """
    relative_location = "Library/Application Support/Claude/IndexedDB/https_claude.ai_0.indexeddb.leveldb"
    return Path.home().joinpath(relative_location)
22
+
23
+
24
def extract_with_plyvel(db_path: Path) -> Iterator[dict]:
    """Yield JSON values found in a LevelDB store (requires: pip install plyvel).

    Each stored value is decoded as UTF-8 with undecodable bytes dropped. Values
    whose text starts with ``{`` or ``[`` are parsed as JSON and yielded; all
    other values, and those that fail to parse, are skipped.

    This is a generator, so the ``ImportError`` for a missing ``plyvel`` and
    the opening of the database happen on first iteration. The database is
    closed when iteration finishes or the generator is closed.

    Args:
        db_path: Directory of the LevelDB database; it must already exist.

    Raises:
        ImportError: If ``plyvel`` could not be imported.
    """
    if not HAS_PLYVEL:
        raise ImportError("plyvel not installed. Run: pip install plyvel")

    database = plyvel.DB(str(db_path), create_if_missing=False)

    try:
        for _key, raw_value in database:
            # IndexedDB stores data with metadata prefixes; only values that
            # already look like JSON text are handled here.
            try:
                text = raw_value.decode("utf-8", errors="ignore")
                if not text.startswith(("{", "[")):
                    continue
                yield json.loads(text)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
    finally:
        database.close()
45
+
46
+
47
def extract_with_chrome_devtools() -> Iterator[dict]:
    """
    Placeholder for extraction over the Chrome DevTools Protocol.

    The planned approach connects to the running Claude desktop app and pulls
    the data out via JavaScript rather than parsing LevelDB directly. Nothing
    is implemented yet, so calling this always raises.

    Raises:
        NotImplementedError: Always.
    """
    # TODO: Implement CDP extraction
    reason = "Chrome DevTools extraction not yet implemented"
    raise NotImplementedError(reason)
57
+
58
+
59
def extract_conversations_from_export(export_path: Path) -> Iterator[dict]:
    """
    Yield conversations from a Claude chat export file.

    Two layouts are accepted: a top-level JSON list of conversations, or a
    JSON object with a "conversations" key. Any other layout yields nothing.

    The file is read as UTF-8 explicitly, so the result does not depend on the
    platform's locale encoding.

    This is a generator: the existence check and the read happen on first
    iteration, not when the function is called.

    Args:
        export_path: Path to the exported JSON file.

    Raises:
        FileNotFoundError: If ``export_path`` does not exist.
    """
    if not export_path.exists():
        raise FileNotFoundError(f"Export file not found: {export_path}")

    with open(export_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Format depends on Claude's export structure
    if isinstance(data, list):
        yield from data
    elif isinstance(data, dict) and "conversations" in data:
        yield from data["conversations"]
77
+
78
+
79
def extract_claude_chats(method: str = "manual") -> Iterator[dict]:
    """
    Extract Claude chat transcripts using the chosen method.

    Args:
        method: Extraction method
            - "manual": Print export instructions, then read the export file
              from ~/.local/share/brainlayer/claude-export.json if it exists
            - "plyvel": Direct LevelDB access (requires plyvel)
            - "cdp": Chrome DevTools Protocol (requires running app)

    Yields:
        Chat conversation dictionaries

    Raises:
        FileNotFoundError: For "plyvel" when the IndexedDB path is missing.
        ValueError: For an unrecognised ``method``.

    This is a generator, so printing and errors occur on first iteration.
    """
    if method == "plyvel":
        leveldb_dir = get_claude_indexeddb_path()
        if not leveldb_dir.exists():
            raise FileNotFoundError(f"Claude IndexedDB not found at: {leveldb_dir}")
        yield from extract_with_plyvel(leveldb_dir)
        return

    if method == "cdp":
        yield from extract_with_chrome_devtools()
        return

    if method != "manual":
        raise ValueError(f"Unknown extraction method: {method}")

    # Manual mode: guide the user through exporting by hand.
    rule = "=" * 70
    instructions = (
        "\n" + rule,
        "MANUAL EXPORT INSTRUCTIONS",
        rule,
        "\n1. Open Claude.ai in your browser",
        "2. Go to Settings > Data & Privacy",
        "3. Click 'Export my data'",
        "4. Download the export file",
        "5. Save it to: ~/.local/share/brainlayer/claude-export.json",
        "\nAlternatively, copy conversations manually from the web interface.",
        rule + "\n",
    )
    for text in instructions:
        print(text)

    export_file = Path.home() / ".local/share/brainlayer/claude-export.json"
    if not export_file.exists():
        print(f"Waiting for export at: {export_file}")
        return

    yield from extract_conversations_from_export(export_file)
123
+
124
+
125
def format_claude_chat_for_pipeline(conversation: dict) -> dict:
    """
    Reshape a Claude chat conversation into the pipeline's conversation record.

    The output has "type" fixed to "conversation", the conversation's "id" and
    "created_at", a "messages" list with role/content/timestamp per message
    (role defaults to "user", content to ""), and a "metadata" dict holding the
    source tag and the conversation's "model". Missing fields become ``None``.

    The input layout is a placeholder until the real export format is known.
    """
    formatted_messages = []
    for raw_message in conversation.get("messages", []):
        formatted_messages.append(
            {
                "role": raw_message.get("role", "user"),
                "content": raw_message.get("content", ""),
                "timestamp": raw_message.get("timestamp"),
            }
        )

    metadata = {
        "source": "claude_desktop",
        "model": conversation.get("model"),
    }

    return {
        "type": "conversation",
        "id": conversation.get("id"),
        "created_at": conversation.get("created_at"),
        "messages": formatted_messages,
        "metadata": metadata,
    }
@@ -0,0 +1,231 @@
1
+ """Extract correction patterns from Claude/Gemini conversations.
2
+
3
+ Identifies where AI assistants helped draft text and user made corrections,
4
+ to learn what the user consistently changes.
5
+ """
6
+
7
+ import json
8
+ import re
9
+ from pathlib import Path
10
+ from typing import Iterator
11
+
12
+
13
def extract_claude_export_conversations(export_path: Path) -> Iterator[dict]:
    """
    Extract conversations from Claude.ai export.

    The file is read as UTF-8 explicitly, so the result does not depend on the
    platform's locale encoding.

    Args:
        export_path: Path to extracted conversations.json; the top-level JSON
            value is iterated as a sequence of conversation objects.

    Yields:
        Conversation dictionaries with keys "id" (from "uuid"), "name"
        (default "Untitled"), "created_at" and "messages". Each message has
        "role" ("user" when the sender is "human", otherwise "assistant"),
        "content" (from "text", default "") and "created_at".
    """
    with open(export_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for conv in data:
        yield {
            "id": conv.get("uuid"),
            "name": conv.get("name", "Untitled"),
            "created_at": conv.get("created_at"),
            "messages": [
                {
                    "role": "user" if msg.get("sender") == "human" else "assistant",
                    "content": msg.get("text", ""),
                    "created_at": msg.get("created_at"),
                }
                for msg in conv.get("chat_messages", [])
            ],
        }
40
+
41
+
42
def is_draft_request(text: str) -> bool:
    """Return True when ``text`` looks like a request for help drafting text.

    The lower-cased text is searched for a fixed set of phrasings ("write a",
    "help me write", "does this sound", "rewrite this", ...); a single match
    anywhere in the text is enough.
    """
    lowered = text.lower()
    request_patterns = (
        r"write (me )?a",
        r"draft (me )?a",
        r"help me write",
        r"can you write",
        r"make (me )?a",
        r"does this sound",
        r"is this (good|right|ok)",
        r"how does this sound",
        r"rewrite this",
        r"improve this",
        r"fix this",
        r"better way to say",
    )
    for pattern in request_patterns:
        if re.search(pattern, lowered):
            return True
    return False
60
+
61
+
62
def is_correction(user_msg: str, prev_assistant_msg: str) -> bool:
    """
    Decide whether a user message reads as a correction of the previous draft.

    Two checks are applied in order:
    - the lower-cased message contains a known correction phrase
      ("no,", "actually,", "too formal", "just say", ...);
    - the message contains a double quote, a single quote, or a colon, taken
      as a hint that alternative text is being supplied.

    ``prev_assistant_msg`` is accepted but not consulted.
    """
    lowered = user_msg.lower()

    markers = (
        "no,",
        "actually,",
        "instead",
        "change it",
        "make it",
        "too formal",
        "too long",
        "too short",
        "simpler",
        "more casual",
        "more professional",
        "shorter",
        "just say",
        "better:",
        "fix:",
        "use this:",
    )
    for marker in markers:
        if marker in lowered:
            return True

    # Quoted text or text after a colon may be the user's alternative wording.
    return any(symbol in user_msg for symbol in ('"', "'", ":"))
103
+
104
+
105
def extract_correction_pairs(conversation: dict) -> list[dict]:
    """
    Collect (draft, correction) pairs from one conversation.

    A pair is recorded for every run of three consecutive messages where a
    user message passes ``is_draft_request``, the next message comes from the
    assistant, and the following user message passes ``is_correction``.

    Args:
        conversation: Dict with a "messages" list (each message having "role"
            and "content") and a "name".

    Returns:
        List of dicts with keys:
        - context: The user's original drafting request
        - ai_draft: The assistant's draft
        - user_correction: The user's correction/feedback
        - conversation_name: The conversation's "name"
    """
    messages = conversation["messages"]
    pairs = []

    for idx, request in enumerate(messages[:-1]):
        # Step 1: a user message asking for a draft.
        if request["role"] != "user" or not is_draft_request(request["content"]):
            continue

        # Step 2: the assistant's reply directly after it.
        reply = messages[idx + 1]
        if not reply or reply["role"] != "assistant":
            continue
        ai_draft = reply["content"]

        # Step 3: a user follow-up that corrects the draft.
        if idx + 2 >= len(messages):
            continue
        follow_up = messages[idx + 2]
        if follow_up["role"] != "user" or not is_correction(follow_up["content"], ai_draft):
            continue

        pairs.append(
            {
                "context": request["content"],
                "ai_draft": ai_draft,
                "user_correction": follow_up["content"],
                "conversation_name": conversation["name"],
            }
        )

    return pairs
142
+
143
+
144
def analyze_correction_patterns(corrections: list[dict]) -> dict:
    """
    Analyze what user consistently changes in AI drafts.

    Only each correction's "user_correction" text is inspected; keyword
    matching is done on its lower-cased form, emoji matching on the original.

    Args:
        corrections: Correction dicts, each with a "user_correction" string.

    Returns:
        ``{}`` for empty input, otherwise a dict with:
        - total_corrections: Number of corrections analysed
        - makes_shorter / makes_longer: Feedback asking for less / more text
        - removes_formality: Feedback asking for a more casual tone
        - adds_emojis: Feedback that itself contains one of the known emojis
        - simplifies_language: Feedback asking for simpler wording
        - common_feedback: Every matched stock phrase, one entry per
          occurrence across corrections (duplicates are kept)
    """
    if not corrections:
        return {}

    # Loop-invariant vocabularies, built once.
    formality_words = ("casual", "informal", "too formal", "stiff")
    emojis = ("😂", "😊", "🔥", "💪", "👍", "❤️")
    simplification_words = ("simpler", "simple", "easier", "plain")
    feedback_phrases = (
        "too formal",
        "too long",
        "too short",
        "more casual",
        "less formal",
        "simpler",
        "add emoji",
        "remove",
    )

    patterns = {
        "total_corrections": len(corrections),
        "makes_shorter": 0,
        "makes_longer": 0,
        "removes_formality": 0,
        "adds_emojis": 0,
        "simplifies_language": 0,
        "common_feedback": [],
    }

    for corr in corrections:
        raw_feedback = corr["user_correction"]
        feedback = raw_feedback.lower()

        # Length changes
        if "shorter" in feedback or "too long" in feedback or "brief" in feedback:
            patterns["makes_shorter"] += 1
        if "longer" in feedback or "more detail" in feedback:
            patterns["makes_longer"] += 1

        # Formality
        if any(word in feedback for word in formality_words):
            patterns["removes_formality"] += 1

        # Emojis
        if any(emoji in raw_feedback for emoji in emojis):
            patterns["adds_emojis"] += 1

        # Simplification
        if any(word in feedback for word in simplification_words):
            patterns["simplifies_language"] += 1

        # Extract common feedback phrases
        patterns["common_feedback"].extend(phrase for phrase in feedback_phrases if phrase in feedback)

    return patterns
206
+
207
+
208
def extract_user_final_versions(corrections: list[dict]) -> list[str]:
    """
    Extract the final versions user actually used/approved.

    For each correction's "user_correction" text, two heuristics are applied:
    - every double-quoted span is collected (without the quotes);
    - the text after the first colon is collected, stripped of surrounding
      whitespace, unless it is empty.

    Both may contribute for the same correction, so one message can yield
    several entries.

    Args:
        corrections: Correction dicts, each with a "user_correction" string.

    Returns:
        Candidate final texts in the order they were found.
    """
    final_versions = []

    for corr in corrections:
        user_text = corr["user_correction"]

        # Look for quoted text
        final_versions.extend(re.findall(r'"([^"]+)"', user_text))

        # Look for text after colon; a trailing colon with nothing after it
        # carries no replacement text and is ignored.
        _, colon, after_colon = user_text.partition(":")
        after_colon = after_colon.strip()
        if colon and after_colon:
            final_versions.append(after_colon)

    return final_versions
@@ -0,0 +1,195 @@
1
+ """Extract and classify markdown files for indexing."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Iterator
6
+
7
+ from .classify import ClassifiedContent, ContentType, ContentValue
8
+
9
+
10
+ def find_markdown_files(
11
+ root: Path, patterns: list[str] | None = None, exclude: list[str] | None = None
12
+ ) -> Iterator[Path]:
13
+ """
14
+ Find markdown files matching glob patterns.
15
+
16
+ Args:
17
+ root: Root directory to search
18
+ patterns: Glob patterns to match (default: ["**/*.md"])
19
+ exclude: Directory names to exclude (default: common non-content dirs)
20
+
21
+ Yields:
22
+ Path objects for matching markdown files
23
+ """
24
+ if patterns is None:
25
+ patterns = ["**/*.md"]
26
+ if exclude is None:
27
+ exclude = ["node_modules", ".git", "dist", "__pycache__", ".venv", "venv"]
28
+
29
+ exclude_set = set(exclude)
30
+
31
+ for pattern in patterns:
32
+ for path in root.glob(pattern):
33
+ # Skip excluded directories
34
+ if any(part in exclude_set for part in path.parts):
35
+ continue
36
+ if path.is_file():
37
+ yield path
38
+
39
+
40
def parse_markdown(file_path: Path) -> list[dict]:
    """
    Parse a markdown file into sections.

    Splits on h2 headers (## ) to create logical chunks.
    Each section includes header hierarchy for context.

    Processing steps:
    1. A leading ``---`` frontmatter block is removed and parsed with a simple
       ``key: value`` line parser (not a full YAML parser).
    2. ``#tag`` tokens are collected from the whole document, ignoring fenced
       code blocks and inline code spans, lower-cased and sorted so the
       output is deterministic.
    3. The document is split at h2 headers. Text before the first h2 becomes
       its own section only when it is longer than 50 characters. A document
       without h2 headers is returned as a single section.

    The first h1 header is the document title, falling back to the file stem.

    Returns:
        List of dicts with keys: content, header, parent_headers, tags,
        frontmatter. All sections share the same tag values and the same
        frontmatter dict.
    """
    content = file_path.read_text(encoding="utf-8", errors="replace")
    sections = []

    # Extract frontmatter if present
    frontmatter = {}
    if content.startswith("---"):
        parts = content.split("---", 2)
        if len(parts) >= 3:
            # Simple YAML parsing for common keys
            for line in parts[1].strip().split("\n"):
                if ":" in line:
                    key, _, value = line.partition(":")
                    frontmatter[key.strip()] = value.strip().strip("\"'")
            content = parts[2]

    # Extract tags from content (#tag patterns not in code blocks).
    # Code is replaced by a non-whitespace placeholder so text that directly
    # follows a code span is not mistaken for the start of a tag.
    prose_only = re.sub(r"```.*?```|`[^`\n]*`", "\x00", content, flags=re.DOTALL)
    tags = set()
    for match in re.finditer(r"(?<!\S)#([a-zA-Z][a-zA-Z0-9_-]*)", prose_only):
        tag = match.group(1).lower()
        # Skip common markdown/code patterns
        if tag not in ("include", "define", "ifdef", "endif", "pragma", "import"):
            tags.add(tag)
    # Set iteration order is not stable between runs; sort for reproducibility.
    sorted_tags = sorted(tags)

    # Document title: first h1, else the file name without suffix
    h1_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
    h1_title = h1_match.group(1) if h1_match else file_path.stem

    # Find all h2 sections
    h2_pattern = r"^##\s+(.+)$"
    h2_positions = [(m.start(), m.group(1)) for m in re.finditer(h2_pattern, content, re.MULTILINE)]

    if not h2_positions:
        # No h2 headers - treat entire file as one section
        sections.append(
            {
                "content": content.strip(),
                "header": h1_title,
                "parent_headers": [],
                "tags": list(sorted_tags),
                "frontmatter": frontmatter,
            }
        )
        return sections

    # Content before first h2
    first_h2_start = h2_positions[0][0]
    if first_h2_start > 0:
        intro = content[:first_h2_start].strip()
        if intro and len(intro) > 50:  # Skip trivial intros
            sections.append(
                {
                    "content": intro,
                    "header": h1_title,
                    "parent_headers": [],
                    "tags": list(sorted_tags),
                    "frontmatter": frontmatter,
                }
            )

    # Each h2 section runs up to the next h2 (or the end of the document)
    for i, (pos, header) in enumerate(h2_positions):
        end_pos = h2_positions[i + 1][0] if i + 1 < len(h2_positions) else len(content)
        sections.append(
            {
                "content": content[pos:end_pos].strip(),
                "header": header,
                "parent_headers": [h1_title],
                "tags": list(sorted_tags),
                "frontmatter": frontmatter,
            }
        )

    return sections
123
+
124
+
125
def classify_by_path(file_path: Path) -> tuple[ContentType, ContentValue]:
    """
    Classify a markdown file based on its path.

    The path is compared in lower case with forward slashes (``as_posix``), so
    the directory checks behave the same whichever separator the platform
    uses. Rules are tried in order and the first match wins:

    1. File named ``claude.md``          -> PROJECT_CONFIG, HIGH
    2. Path contains ``learnings/``      -> LEARNING, HIGH
    3. Path contains ``skills/``         -> SKILL, HIGH
    4. Path contains ``research/``       -> RESEARCH, HIGH
    5. Path contains ``/prd``, ``prd-`` or ``prd_`` -> PRD_ARCHIVE, MEDIUM
    6. Path contains ``/verification`` or ``verification-`` -> VERIFICATION, LOW
    7. Anything else                     -> DOCUMENTATION, MEDIUM

    These are substring checks, so a directory whose name merely ends with
    e.g. ``skills`` also matches rule 3.

    Returns:
        Tuple of (ContentType, ContentValue) based on path patterns
    """
    path_str = file_path.as_posix().lower()
    name = file_path.name.lower()

    # CLAUDE.md files are high-value project config
    if name == "claude.md":
        return ContentType.PROJECT_CONFIG, ContentValue.HIGH

    # Learnings are curated high-value content
    if "learnings/" in path_str:
        return ContentType.LEARNING, ContentValue.HIGH

    # Skills are high-value reference
    if "skills/" in path_str:
        return ContentType.SKILL, ContentValue.HIGH

    # Research docs
    if "research/" in path_str:
        return ContentType.RESEARCH, ContentValue.HIGH

    # PRD archives - medium value
    if "/prd" in path_str or "prd-" in path_str or "prd_" in path_str:
        return ContentType.PRD_ARCHIVE, ContentValue.MEDIUM

    # Verification rounds - low value (voluminous)
    if "/verification" in path_str or "verification-" in path_str:
        return ContentType.VERIFICATION, ContentValue.LOW

    # Default: documentation
    return ContentType.DOCUMENTATION, ContentValue.MEDIUM
161
+
162
+
163
def extract_markdown_content(file_path: Path) -> list[ClassifiedContent]:
    """
    Turn one markdown file into classified, ready-to-chunk sections.

    The file is classified once from its path, then parsed into sections;
    every section gets the same content type and value. Section metadata
    holds the source file, the section header, a "parent > header" context
    string, the tags and the frontmatter.

    Returns:
        List of ClassifiedContent objects for each section
    """
    content_type, value = classify_by_path(file_path)
    source_file = str(file_path)

    return [
        ClassifiedContent(
            content=section["content"],
            content_type=content_type,
            value=value,
            metadata={
                "source_file": source_file,
                "header": section["header"],
                # Header hierarchy joined into a breadcrumb-style string
                "header_context": " > ".join(section["parent_headers"] + [section["header"]]),
                "tags": section["tags"],
                "frontmatter": section.get("frontmatter", {}),
            },
        )
        for section in parse_markdown(file_path)
    ]