okb 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,484 @@
1
+ """GitHub API source for syncing repository content."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from datetime import UTC, datetime
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from github import Github
11
+ from github.Repository import Repository
12
+
13
+ from okb.ingest import Document
14
+ from okb.plugins.base import SyncState
15
+
16
# File extensions eligible for ingestion (matches config.py defaults).
# Membership is tested against the lower-cased extension, leading dot included.
INGESTABLE_EXTENSIONS = frozenset(
    {
        # Prose / documentation formats
        ".md", ".txt", ".markdown", ".org",
        # Scripting and application code
        ".py", ".rb", ".js", ".ts", ".jsx", ".tsx",
        # Queries and shell
        ".sql", ".sh", ".bash", ".fish",
        # Configuration / data
        ".yaml", ".yml", ".toml", ".json",
        # Web
        ".html", ".css", ".scss",
        # Compiled languages
        ".go", ".rs", ".java", ".kt", ".c", ".cpp", ".h",
    }
)
51
+
52
# Maps a lower-cased GitHub label name to a priority from 1 (highest) to 5.
PRIORITY_LABELS = {
    # Labels that state a priority outright
    "priority:critical": 1,
    "priority:high": 2,
    "priority:medium": 3,
    "priority:low": 4,
    "p0": 1,
    "p1": 2,
    "p2": 3,
    "p3": 4,
    "p4": 5,
} | {
    # Severity / issue-type labels that merely imply a priority
    "critical": 1,
    "urgent": 1,
    "bug": 2,
    "security": 2,
    "enhancement": 4,
    "feature": 4,
    "documentation": 5,
    "question": 5,
}
74
+
75
+
76
def _get_priority_from_labels(labels: list) -> int | None:
    """Return the priority implied by the first recognised label.

    Args:
        labels: Label objects exposing a ``name`` attribute.

    Returns:
        The ``PRIORITY_LABELS`` value of the first label (in the given order)
        whose lower-cased name is a known key, or ``None`` when none match.
    """
    lowered_names = (entry.name.lower() for entry in labels)
    # Lazy on purpose: evaluation stops at the first hit, so labels after the
    # first match are never inspected.
    hits = (PRIORITY_LABELS[key] for key in lowered_names if key in PRIORITY_LABELS)
    return next(hits, None)
83
+
84
+
85
+ class GitHubSource:
86
+ """API source for GitHub repository content.
87
+
88
+ Syncs repository files, issues, PRs, and wiki pages.
89
+
90
+ Config example:
91
+ plugins:
92
+ sources:
93
+ github:
94
+ enabled: true
95
+ token: ${GITHUB_TOKEN}
96
+
97
+ Usage:
98
+ lkb sync run github --repo owner/repo # README + docs/ (default)
99
+ lkb sync run github --repo owner/repo --source # All source files
100
+ lkb sync run github --repo owner/repo --issues # Include issues
101
+ lkb sync run github --repo owner/repo --prs # Include PRs
102
+ lkb sync run github --repo owner/repo --wiki # Include wiki
103
+ """
104
+
105
+ name = "github"
106
+ source_type = "github-source"
107
+
108
+ def __init__(self) -> None:
109
+ self._client: Github | None = None
110
+ self._token: str | None = None
111
+ self._repos: list[str] = []
112
+ self._include_source: bool = False
113
+ self._include_issues: bool = False
114
+ self._include_prs: bool = False
115
+ self._include_wiki: bool = False
116
+
117
+ def configure(self, config: dict) -> None:
118
+ """Initialize GitHub client with token.
119
+
120
+ Args:
121
+ config: Source configuration containing 'token' and CLI options
122
+ """
123
+ from github import Github
124
+
125
+ token = config.get("token")
126
+ if not token:
127
+ raise ValueError("github source requires 'token' in config")
128
+
129
+ repos = config.get("repos", [])
130
+ if not repos:
131
+ raise ValueError("github source requires --repo flag")
132
+
133
+ self._client = Github(token)
134
+ self._token = token
135
+ self._repos = repos
136
+ self._include_source = config.get("include_source", False)
137
+ self._include_issues = config.get("include_issues", False)
138
+ self._include_prs = config.get("include_prs", False)
139
+ self._include_wiki = config.get("include_wiki", False)
140
+
141
+ def fetch(self, state: SyncState | None = None) -> tuple[list[Document], SyncState]:
142
+ """Fetch content from GitHub repositories.
143
+
144
+ Args:
145
+ state: Previous sync state for incremental updates
146
+
147
+ Returns:
148
+ Tuple of (list of documents, new sync state)
149
+ """
150
+ from okb.plugins.base import SyncState as SyncStateClass
151
+
152
+ if self._client is None:
153
+ raise RuntimeError("Source not configured. Call configure() first.")
154
+
155
+ documents: list[Document] = []
156
+ extra = state.extra if state else {}
157
+ last_sync = state.last_sync if state else None
158
+
159
+ for repo_name in self._repos:
160
+ print(f"Syncing {repo_name}...", file=sys.stderr)
161
+ try:
162
+ repo = self._client.get_repo(repo_name)
163
+ repo_extra = extra.get(repo_name, {})
164
+
165
+ # Sync source files (default: README + docs/, or all with --source)
166
+ source_docs, new_sha = self._sync_source_files(repo, repo_extra.get("commit_sha"))
167
+ documents.extend(source_docs)
168
+ extra[repo_name] = {"commit_sha": new_sha}
169
+
170
+ # Sync issues if requested
171
+ if self._include_issues:
172
+ issue_docs = self._sync_issues(repo, last_sync)
173
+ documents.extend(issue_docs)
174
+
175
+ # Sync PRs if requested
176
+ if self._include_prs:
177
+ pr_docs = self._sync_prs(repo, last_sync)
178
+ documents.extend(pr_docs)
179
+
180
+ # Sync wiki if requested
181
+ if self._include_wiki:
182
+ wiki_docs = self._sync_wiki(repo)
183
+ documents.extend(wiki_docs)
184
+
185
+ except Exception as e:
186
+ print(f" Error syncing {repo_name}: {e}", file=sys.stderr)
187
+
188
+ new_state = SyncStateClass(
189
+ last_sync=datetime.now(UTC),
190
+ extra=extra,
191
+ )
192
+
193
+ return documents, new_state
194
+
195
+ def _sync_source_files(
196
+ self, repo: Repository, last_sha: str | None
197
+ ) -> tuple[list[Document], str]:
198
+ """Sync source files from the repository.
199
+
200
+ Args:
201
+ repo: GitHub repository object
202
+ last_sha: Last synced commit SHA
203
+
204
+ Returns:
205
+ Tuple of (documents, current commit SHA)
206
+ """
207
+ from okb.ingest import Document, DocumentMetadata
208
+
209
+ documents: list[Document] = []
210
+
211
+ # Get current HEAD commit
212
+ default_branch = repo.default_branch
213
+ current_sha = repo.get_branch(default_branch).commit.sha
214
+
215
+ # Skip if no changes
216
+ if last_sha == current_sha:
217
+ print(f" Source files unchanged (SHA: {current_sha[:8]})", file=sys.stderr)
218
+ return documents, current_sha
219
+
220
+ print(f" Fetching source files (SHA: {current_sha[:8]})...", file=sys.stderr)
221
+
222
+ # Get repository tree
223
+ tree = repo.get_git_tree(current_sha, recursive=True)
224
+
225
+ for item in tree.tree:
226
+ if item.type != "blob":
227
+ continue
228
+
229
+ path = item.path
230
+
231
+ # Check if file should be included
232
+ if not self._should_include_file(path):
233
+ continue
234
+
235
+ # Check extension
236
+ ext = "." + path.rsplit(".", 1)[-1] if "." in path else ""
237
+ if ext.lower() not in INGESTABLE_EXTENSIONS:
238
+ continue
239
+
240
+ try:
241
+ content = self._get_file_content(repo, item.sha)
242
+ if content is None:
243
+ continue
244
+
245
+ title = path.split("/")[-1]
246
+ doc = Document(
247
+ source_path=f"github://{repo.full_name}/blob/{default_branch}/{path}",
248
+ source_type="github-source",
249
+ title=title,
250
+ content=content,
251
+ metadata=DocumentMetadata(
252
+ project=repo.name,
253
+ extra={
254
+ "repo": repo.full_name,
255
+ "path": path,
256
+ "sha": item.sha,
257
+ },
258
+ ),
259
+ )
260
+ documents.append(doc)
261
+ print(f" Synced: {path}", file=sys.stderr)
262
+ except Exception as e:
263
+ print(f" Error fetching {path}: {e}", file=sys.stderr)
264
+
265
+ return documents, current_sha
266
+
267
+ def _should_include_file(self, path: str) -> bool:
268
+ """Check if a file should be included based on sync options."""
269
+ if self._include_source:
270
+ # Include all files
271
+ return True
272
+
273
+ # Default: README* at root + docs/**/*
274
+ path_lower = path.lower()
275
+
276
+ # README files at root
277
+ if "/" not in path and path_lower.startswith("readme"):
278
+ return True
279
+
280
+ # Files in docs/ directory
281
+ if path_lower.startswith("docs/"):
282
+ return True
283
+
284
+ return False
285
+
286
+ def _get_file_content(self, repo: Repository, sha: str) -> str | None:
287
+ """Get file content by blob SHA."""
288
+ import base64
289
+
290
+ blob = repo.get_git_blob(sha)
291
+ if blob.encoding == "base64":
292
+ try:
293
+ return base64.b64decode(blob.content).decode("utf-8")
294
+ except UnicodeDecodeError:
295
+ return None # Binary file
296
+ return blob.content
297
+
298
+ def _sync_issues(self, repo: Repository, since: datetime | None) -> list[Document]:
299
+ """Sync GitHub issues."""
300
+ from okb.ingest import Document, DocumentMetadata
301
+
302
+ documents: list[Document] = []
303
+ print(" Fetching issues...", file=sys.stderr)
304
+
305
+ # Fetch issues updated since last sync
306
+ kwargs = {"state": "all", "sort": "updated", "direction": "desc"}
307
+ if since:
308
+ kwargs["since"] = since
309
+
310
+ issues = repo.get_issues(**kwargs)
311
+ count = 0
312
+
313
+ for issue in issues:
314
+ # Skip pull requests (they show up in issues API)
315
+ if issue.pull_request is not None:
316
+ continue
317
+
318
+ # Build content (title + body, no comments per user request)
319
+ content = f"# {issue.title}\n\n{issue.body or ''}"
320
+
321
+ # Extract labels
322
+ labels = [label.name for label in issue.labels]
323
+
324
+ doc = Document(
325
+ source_path=f"github://{repo.full_name}/issues/{issue.number}",
326
+ source_type="github-issue",
327
+ title=f"#{issue.number}: {issue.title}",
328
+ content=content,
329
+ status="open" if issue.state == "open" else "closed",
330
+ priority=_get_priority_from_labels(issue.labels),
331
+ metadata=DocumentMetadata(
332
+ project=repo.name,
333
+ tags=labels,
334
+ extra={
335
+ "repo": repo.full_name,
336
+ "number": issue.number,
337
+ "author": issue.user.login if issue.user else None,
338
+ "created_at": issue.created_at.isoformat() if issue.created_at else None,
339
+ "updated_at": issue.updated_at.isoformat() if issue.updated_at else None,
340
+ "url": issue.html_url,
341
+ },
342
+ ),
343
+ )
344
+ documents.append(doc)
345
+ count += 1
346
+
347
+ print(f" Synced {count} issues", file=sys.stderr)
348
+ return documents
349
+
350
+ def _sync_prs(self, repo: Repository, since: datetime | None) -> list[Document]:
351
+ """Sync GitHub pull requests."""
352
+ from okb.ingest import Document, DocumentMetadata
353
+
354
+ documents: list[Document] = []
355
+ print(" Fetching pull requests...", file=sys.stderr)
356
+
357
+ # Fetch PRs - unfortunately the PR API doesn't have a 'since' parameter
358
+ # We'll filter by updated_at manually
359
+ pulls = repo.get_pulls(state="all", sort="updated", direction="desc")
360
+ count = 0
361
+
362
+ for pr in pulls:
363
+ # Skip if older than last sync
364
+ if since and pr.updated_at and pr.updated_at < since:
365
+ break # PRs are sorted by updated, so we can stop here
366
+
367
+ # Build content (title + body, no comments per user request)
368
+ content = f"# {pr.title}\n\n{pr.body or ''}"
369
+
370
+ # Extract labels
371
+ labels = [label.name for label in pr.labels]
372
+
373
+ # Determine status
374
+ if pr.merged:
375
+ status = "merged"
376
+ elif pr.state == "open":
377
+ status = "open"
378
+ else:
379
+ status = "closed"
380
+
381
+ doc = Document(
382
+ source_path=f"github://{repo.full_name}/pull/{pr.number}",
383
+ source_type="github-pr",
384
+ title=f"PR #{pr.number}: {pr.title}",
385
+ content=content,
386
+ status=status,
387
+ priority=_get_priority_from_labels(pr.labels),
388
+ metadata=DocumentMetadata(
389
+ project=repo.name,
390
+ tags=labels,
391
+ extra={
392
+ "repo": repo.full_name,
393
+ "number": pr.number,
394
+ "author": pr.user.login if pr.user else None,
395
+ "created_at": pr.created_at.isoformat() if pr.created_at else None,
396
+ "updated_at": pr.updated_at.isoformat() if pr.updated_at else None,
397
+ "merged_at": pr.merged_at.isoformat() if pr.merged_at else None,
398
+ "url": pr.html_url,
399
+ "base": pr.base.ref,
400
+ "head": pr.head.ref,
401
+ },
402
+ ),
403
+ )
404
+ documents.append(doc)
405
+ count += 1
406
+
407
+ print(f" Synced {count} pull requests", file=sys.stderr)
408
+ return documents
409
+
410
+ def _sync_wiki(self, repo: Repository) -> list[Document]:
411
+ """Sync GitHub wiki pages.
412
+
413
+ Note: GitHub doesn't have an API for wiki content.
414
+ We clone the wiki repo and read files directly.
415
+ """
416
+ from okb.ingest import Document, DocumentMetadata
417
+
418
+ documents: list[Document] = []
419
+
420
+ # Check if wiki exists
421
+ if not repo.has_wiki:
422
+ print(" Wiki not enabled for this repo", file=sys.stderr)
423
+ return documents
424
+
425
+ print(" Fetching wiki pages...", file=sys.stderr)
426
+
427
+ # Wiki is a separate git repo at {repo}.wiki.git
428
+ # We need to use git to clone it
429
+ import tempfile
430
+ from pathlib import Path
431
+ from subprocess import CalledProcessError, run
432
+
433
+ # Use token in URL for authentication
434
+ wiki_url = f"https://{self._token}@github.com/{repo.full_name}.wiki.git"
435
+
436
+ with tempfile.TemporaryDirectory() as tmpdir:
437
+ try:
438
+ # Clone wiki repo (shallow)
439
+ result = run(
440
+ ["git", "clone", "--depth", "1", wiki_url, tmpdir],
441
+ capture_output=True,
442
+ text=True,
443
+ timeout=60,
444
+ )
445
+ if result.returncode != 0:
446
+ # Wiki might not exist even if has_wiki is True
447
+ print(" Wiki repository not accessible", file=sys.stderr)
448
+ return documents
449
+
450
+ # Find all markdown files
451
+ wiki_path = Path(tmpdir)
452
+ count = 0
453
+
454
+ for md_file in wiki_path.glob("*.md"):
455
+ try:
456
+ content = md_file.read_text(encoding="utf-8")
457
+ title = md_file.stem.replace("-", " ")
458
+
459
+ doc = Document(
460
+ source_path=f"github://{repo.full_name}/wiki/{md_file.stem}",
461
+ source_type="github-wiki",
462
+ title=title,
463
+ content=content,
464
+ metadata=DocumentMetadata(
465
+ project=repo.name,
466
+ extra={
467
+ "repo": repo.full_name,
468
+ "page": md_file.stem,
469
+ },
470
+ ),
471
+ )
472
+ documents.append(doc)
473
+ count += 1
474
+ except Exception as e:
475
+ print(f" Error reading {md_file.name}: {e}", file=sys.stderr)
476
+
477
+ print(f" Synced {count} wiki pages", file=sys.stderr)
478
+
479
+ except CalledProcessError as e:
480
+ print(f" Error cloning wiki: {e}", file=sys.stderr)
481
+ except Exception as e:
482
+ print(f" Error syncing wiki: {e}", file=sys.stderr)
483
+
484
+ return documents