buildlog-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
buildlog/distill.py ADDED
@@ -0,0 +1,374 @@
"""Extract and aggregate patterns from buildlog entries."""

from __future__ import annotations

__all__ = [
    "CATEGORIES",
    "DistillResult",
    "distill_all",
    "format_output",
    "parse_improvements",
    "parse_date_from_filename",
    "iter_buildlog_entries",
]

import json
import logging
import re
from collections.abc import Iterator
from dataclasses import dataclass, field
from datetime import UTC, date, datetime
from pathlib import Path
from typing import Final, Literal, TypedDict

logger = logging.getLogger(__name__)

# Valid improvement categories (lowercase for matching)
CATEGORIES: Final[tuple[str, ...]] = (
    "architectural",
    "workflow",
    "tool_usage",
    "domain_knowledge",
)

# Map from markdown heading to normalized category name
CATEGORY_MAP: Final[dict[str, str]] = {
    "architectural": "architectural",
    "workflow": "workflow",
    "tool usage": "tool_usage",
    "tool_usage": "tool_usage",
    "domain knowledge": "domain_knowledge",
    "domain_knowledge": "domain_knowledge",
}

# File matching pattern for buildlog entries
BUILDLOG_GLOB_PATTERN: Final[str] = "20??-??-??-*.md"

# Type definitions
OutputFormat = Literal["json", "yaml"]


class PatternDict(TypedDict):
    """Type for a single pattern dictionary."""

    insight: str
    source: str
    date: str
    context: str


class StatisticsDict(TypedDict):
    """Type for statistics dictionary."""

    total_patterns: int
    by_category: dict[str, int]
    by_month: dict[str, int]


class DistillResultDict(TypedDict):
    """Type for full distill result dictionary."""

    extracted_at: str
    entry_count: int
    patterns: dict[str, list[PatternDict]]
    statistics: StatisticsDict


@dataclass
class DistillResult:
    """Aggregated patterns from all buildlog entries."""

    extracted_at: str
    entry_count: int
    patterns: dict[str, list[PatternDict]] = field(default_factory=dict)
    statistics: StatisticsDict = field(default_factory=dict)

    def to_dict(self) -> DistillResultDict:
        """Convert to dictionary for JSON/YAML serialization."""
        return {
            "extracted_at": self.extracted_at,
            "entry_count": self.entry_count,
            "patterns": self.patterns,
            "statistics": self.statistics,
        }


def _is_valid_insight(insight: str) -> bool:
    """Filter predicate for valid insights (not placeholders)."""
    if not insight:
        return False
    if insight.startswith("[") and insight.endswith("]"):
        return False
    if insight.startswith("e.g.,"):
        return False
    return True


def extract_title_and_context(content: str) -> str:
    """Extract a context description from the entry title."""
    match = re.search(r"^#\s+Build Journal:\s*(.+)$", content, re.MULTILINE)
    if match:
        title = match.group(1).strip()
        if title and title != "[TITLE]":
            return title
    return ""

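
# Editor's note: the small sketch below is not part of the released wheel. It is
# a hypothetical illustration of how the title and placeholder checks above
# behave; the _demo_title_and_placeholders name and sample strings are invented.
def _demo_title_and_placeholders() -> None:
    assert extract_title_and_context("# Build Journal: Fixing a flaky CI job") == (
        "Fixing a flaky CI job"
    )
    # Template placeholders are rejected, real insights pass.
    assert not _is_valid_insight("[Describe the architectural insight]")
    assert not _is_valid_insight("e.g., prefer composition over inheritance")
    assert _is_valid_insight("Prefer composition over inheritance here")
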
def _parse_bullet_content(category_content: str) -> list[str]:
    """Parse bullet points from category content, handling multi-line bullets.

    A bullet can span multiple lines if continuation lines are indented.
    Example:
        - This is a long insight that
          continues on the next line

    Returns:
        List of complete bullet point texts.
    """
    bullets: list[str] = []
    current_bullet: list[str] = []

    for line in category_content.split("\n"):
        # New bullet point starts with optional whitespace, dash, space, then content
        bullet_match = re.match(r"^\s*-\s+(.+)$", line)
        if bullet_match:
            # Save previous bullet if exists
            if current_bullet:
                bullets.append(" ".join(current_bullet))
            current_bullet = [bullet_match.group(1).strip()]
        elif current_bullet and line.strip():
            # Continuation line: non-empty, not a new bullet.
            # Must be indented (starts with whitespace) to be a continuation.
            if line.startswith((" ", "\t")):
                current_bullet.append(line.strip())
            # Otherwise it's unrelated content, ignore it

    # Don't forget the last bullet
    if current_bullet:
        bullets.append(" ".join(current_bullet))

    return bullets

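
# Editor's note: a minimal sketch, not part of the released wheel, showing how
# indented continuation lines are joined into a single bullet; the
# _demo_bullet_continuation name and the sample text are invented here.
def _demo_bullet_continuation() -> None:
    sample = "- A long insight that\n  wraps onto a second line\n- A short one\n"
    assert _parse_bullet_content(sample) == [
        "A long insight that wraps onto a second line",
        "A short one",
    ]
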
def parse_improvements(content: str) -> dict[str, list[str]]:
    """Extract Improvements section from buildlog markdown.

    Args:
        content: The full markdown content of a buildlog entry.

    Returns:
        Dictionary mapping category names to lists of improvement insights.
    """
    result: dict[str, list[str]] = {cat: [] for cat in CATEGORIES}

    # Stop at any H1 or H2 header (not H3+), or end of string
    improvements_match = re.search(
        r"^##\s+Improvements\s*\n(.*?)(?=^#{1,2}\s|\Z)",
        content,
        re.MULTILINE | re.DOTALL,
    )

    if not improvements_match:
        return result

    improvements_section = improvements_match.group(1)

    # Match H3 headers but NOT H4+ (use negative lookahead for 4th #)
    category_pattern = re.compile(
        r"^###(?!#)\s+([^\n]+)\s*\n(.*?)(?=^###(?!#)|\Z)", re.MULTILINE | re.DOTALL
    )

    for category_match in category_pattern.finditer(improvements_section):
        raw_category = category_match.group(1).strip().lower()
        normalized = CATEGORY_MAP.get(raw_category)
        if not normalized:
            continue

        category_content = category_match.group(2)
        bullets = _parse_bullet_content(category_content)
        result[normalized] = list(filter(_is_valid_insight, bullets))

    return result

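
# Editor's note: a hedged usage sketch, not part of the released wheel. It feeds
# a small hypothetical buildlog entry through parse_improvements() to show
# category normalization ("Tool Usage" -> "tool_usage"), multi-line bullets,
# and placeholder filtering. The _demo_parse_improvements name is invented.
def _demo_parse_improvements() -> None:
    sample = (
        "# Build Journal: Example entry\n"
        "\n"
        "## Improvements\n"
        "\n"
        "### Tool Usage\n"
        "- Prefer ripgrep over grep for large trees\n"
        "  because it respects .gitignore\n"
        "\n"
        "### Workflow\n"
        "- [Describe the workflow insight]\n"
    )
    improvements = parse_improvements(sample)
    assert improvements["tool_usage"] == [
        "Prefer ripgrep over grep for large trees because it respects .gitignore"
    ]
    # The bracketed template placeholder is filtered out.
    assert improvements["workflow"] == []
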
def parse_date_from_filename(filename: str) -> str | None:
    """Extract date from buildlog filename (YYYY-MM-DD-slug.md format)."""
    match = re.match(r"^(\d{4}-\d{2}-\d{2})-", filename)
    return match.group(1) if match else None


def _extract_month_key(date_str: str) -> str:
    """Extract YYYY-MM month key from YYYY-MM-DD date string."""
    return date_str[:7]

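
# Editor's note: a tiny sketch, not part of the released wheel, showing the
# filename and month-key helpers on an invented filename.
def _demo_date_helpers() -> None:
    assert parse_date_from_filename("2024-03-17-refactor-parser.md") == "2024-03-17"
    assert parse_date_from_filename("notes.md") is None
    assert _extract_month_key("2024-03-17") == "2024-03"
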
def iter_buildlog_entries(
    buildlog_dir: Path, since: date | None = None
) -> Iterator[tuple[Path, str]]:
    """Iterate over buildlog entries, optionally filtered by date.

    Args:
        buildlog_dir: Path to the buildlog directory.
        since: If provided, only yield entries from this date onward.

    Yields:
        Tuples of (file_path, date_string) for each matching entry.
    """
    for entry_path in sorted(buildlog_dir.glob(BUILDLOG_GLOB_PATTERN)):
        date_str = parse_date_from_filename(entry_path.name)
        if not date_str:
            continue

        # Always validate the date, not just when filtering
        try:
            entry_date = date.fromisoformat(date_str)
        except ValueError:
            logger.warning("Invalid date in filename: %s", entry_path.name)
            continue

        if since and entry_date < since:
            continue

        yield entry_path, date_str

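
# Editor's note: a hedged usage sketch, not part of the released wheel. The
# "buildlog" directory path is hypothetical; entries older than `since` are
# skipped, and filenames with malformed dates are logged and ignored.
def _demo_iter_entries() -> None:
    recent = iter_buildlog_entries(Path("buildlog"), since=date(2024, 1, 1))
    for entry_path, date_str in recent:
        print(f"{date_str}: {entry_path.name}")
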
def _create_patterns_for_entry(
    improvements: dict[str, list[str]],
    source: str,
    date_str: str,
    context: str,
) -> dict[str, list[PatternDict]]:
    """Create pattern dicts from improvements - pure function."""
    return {
        category: [
            PatternDict(
                insight=insight,
                source=source,
                date=date_str,
                context=context,
            )
            for insight in insights
        ]
        for category, insights in improvements.items()
    }


def _merge_patterns(
    target: dict[str, list[PatternDict]],
    source: dict[str, list[PatternDict]],
) -> None:
    """Merge source patterns into target (mutates target)."""
    for category, patterns in source.items():
        if category in target:
            target[category].extend(patterns)


def _apply_category_filter(
    patterns: dict[str, list[PatternDict]],
    category: str | None,
) -> dict[str, list[PatternDict]]:
    """Filter patterns to single category if specified."""
    if category is None:
        return patterns
    return {category: patterns.get(category, [])}


def _compute_statistics(
    patterns: dict[str, list[PatternDict]],
    by_month: dict[str, int],
) -> StatisticsDict:
    """Compute statistics from aggregated patterns."""
    by_category = {cat: len(items) for cat, items in patterns.items()}
    return {
        "total_patterns": sum(by_category.values()),
        "by_category": by_category,
        "by_month": dict(sorted(by_month.items())),
    }


def distill_all(
    buildlog_dir: Path,
    since: date | None = None,
    category_filter: str | None = None,
) -> DistillResult:
    """Parse all buildlog entries and aggregate patterns.

    Args:
        buildlog_dir: Path to the buildlog directory.
        since: If provided, only include entries from this date onward.
        category_filter: If provided, only include patterns from this category.

    Returns:
        DistillResult with aggregated patterns and statistics.
    """
    patterns: dict[str, list[PatternDict]] = {cat: [] for cat in CATEGORIES}
    by_month: dict[str, int] = {}
    entry_count = 0

    for entry_path, date_str in iter_buildlog_entries(buildlog_dir, since):
        try:
            content = entry_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            logger.warning("Failed to read %s: %s", entry_path, e)
            continue

        entry_count += 1
        context = extract_title_and_context(content)

        month_key = _extract_month_key(date_str)
        by_month[month_key] = by_month.get(month_key, 0) + 1

        try:
            improvements = parse_improvements(content)
        except re.error as e:
            logger.warning("Failed to parse improvements in %s: %s", entry_path, e)
            continue

        entry_patterns = _create_patterns_for_entry(
            improvements, str(entry_path), date_str, context
        )
        _merge_patterns(patterns, entry_patterns)

    patterns = _apply_category_filter(patterns, category_filter)
    statistics = _compute_statistics(patterns, by_month)

    return DistillResult(
        extracted_at=datetime.now(UTC).isoformat().replace("+00:00", "Z"),
        entry_count=entry_count,
        patterns=patterns,
        statistics=statistics,
    )

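
# Editor's note: a hedged end-to-end sketch, not part of the released wheel.
# The directory path, date, and category value are invented; distill_all()
# returns a DistillResult whose statistics reflect only the filtered patterns.
def _demo_distill_all() -> None:
    result = distill_all(
        Path("buildlog"),
        since=date(2024, 1, 1),
        category_filter="tool_usage",
    )
    print(result.entry_count, result.statistics["total_patterns"])
    for pattern in result.patterns.get("tool_usage", []):
        print(f"{pattern['date']}: {pattern['insight']}")
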
def format_output(result: DistillResult, fmt: OutputFormat = "json") -> str:
    """Format the distill result as JSON or YAML.

    Args:
        result: The DistillResult to format.
        fmt: Output format, either "json" or "yaml".

    Returns:
        Formatted string representation.

    Raises:
        ValueError: If the format is not recognized.
        ImportError: If PyYAML is required but not installed.
    """
    data = result.to_dict()

    if fmt == "json":
        return json.dumps(data, indent=2, ensure_ascii=False)

    if fmt == "yaml":
        try:
            import yaml
        except ImportError as e:
            raise ImportError(
                "PyYAML is required for YAML output. Install it with: pip install pyyaml"
            ) from e
        return yaml.dump(
            data, default_flow_style=False, allow_unicode=True, sort_keys=False
        )

    # Unreachable for type-checked callers (fmt is a Literal), kept as defensive coding
    raise ValueError(f"Unknown format: {fmt}")
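
# Editor's note: a hedged sketch combining distill_all() and format_output(),
# not part of the released wheel; the path and output filename are invented.
# YAML output needs the optional PyYAML dependency; JSON has no extra requirement.
def _demo_format_output() -> None:
    result = distill_all(Path("buildlog"))
    Path("distilled.json").write_text(format_output(result, fmt="json"), encoding="utf-8")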