devrel-origin 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. devrel_origin/__init__.py +15 -0
  2. devrel_origin/cli/__init__.py +92 -0
  3. devrel_origin/cli/_common.py +243 -0
  4. devrel_origin/cli/analytics.py +28 -0
  5. devrel_origin/cli/argus.py +497 -0
  6. devrel_origin/cli/auth.py +227 -0
  7. devrel_origin/cli/config.py +108 -0
  8. devrel_origin/cli/content.py +259 -0
  9. devrel_origin/cli/cost.py +108 -0
  10. devrel_origin/cli/cro.py +298 -0
  11. devrel_origin/cli/deliverables.py +65 -0
  12. devrel_origin/cli/docs.py +91 -0
  13. devrel_origin/cli/doctor.py +178 -0
  14. devrel_origin/cli/experiment.py +29 -0
  15. devrel_origin/cli/growth.py +97 -0
  16. devrel_origin/cli/init.py +472 -0
  17. devrel_origin/cli/intel.py +27 -0
  18. devrel_origin/cli/kb.py +96 -0
  19. devrel_origin/cli/listen.py +31 -0
  20. devrel_origin/cli/marketing.py +66 -0
  21. devrel_origin/cli/migrate.py +45 -0
  22. devrel_origin/cli/run.py +46 -0
  23. devrel_origin/cli/sales.py +57 -0
  24. devrel_origin/cli/schedule.py +62 -0
  25. devrel_origin/cli/synthesize.py +28 -0
  26. devrel_origin/cli/triage.py +29 -0
  27. devrel_origin/cli/video.py +35 -0
  28. devrel_origin/core/__init__.py +58 -0
  29. devrel_origin/core/agent_config.py +75 -0
  30. devrel_origin/core/argus.py +964 -0
  31. devrel_origin/core/atlas.py +1450 -0
  32. devrel_origin/core/base.py +372 -0
  33. devrel_origin/core/cyra.py +563 -0
  34. devrel_origin/core/dex.py +708 -0
  35. devrel_origin/core/echo.py +614 -0
  36. devrel_origin/core/growth/__init__.py +27 -0
  37. devrel_origin/core/growth/recommendations.py +219 -0
  38. devrel_origin/core/growth/target_kinds.py +51 -0
  39. devrel_origin/core/iris.py +513 -0
  40. devrel_origin/core/kai.py +1367 -0
  41. devrel_origin/core/llm.py +542 -0
  42. devrel_origin/core/llm_backends.py +274 -0
  43. devrel_origin/core/mox.py +514 -0
  44. devrel_origin/core/nova.py +349 -0
  45. devrel_origin/core/pax.py +1205 -0
  46. devrel_origin/core/rex.py +532 -0
  47. devrel_origin/core/sage.py +486 -0
  48. devrel_origin/core/sentinel.py +385 -0
  49. devrel_origin/core/types.py +98 -0
  50. devrel_origin/core/video/__init__.py +22 -0
  51. devrel_origin/core/video/assembler.py +131 -0
  52. devrel_origin/core/video/browser_recorder.py +118 -0
  53. devrel_origin/core/video/desktop_recorder.py +254 -0
  54. devrel_origin/core/video/overlay_renderer.py +143 -0
  55. devrel_origin/core/video/script_parser.py +147 -0
  56. devrel_origin/core/video/tts_engine.py +82 -0
  57. devrel_origin/core/vox.py +268 -0
  58. devrel_origin/core/watchdog.py +321 -0
  59. devrel_origin/project/__init__.py +1 -0
  60. devrel_origin/project/config.py +75 -0
  61. devrel_origin/project/cost_sink.py +61 -0
  62. devrel_origin/project/init.py +104 -0
  63. devrel_origin/project/paths.py +75 -0
  64. devrel_origin/project/state.py +241 -0
  65. devrel_origin/project/templates/__init__.py +4 -0
  66. devrel_origin/project/templates/config.toml +24 -0
  67. devrel_origin/project/templates/devrel.gitignore +10 -0
  68. devrel_origin/project/templates/slop-blocklist.md +45 -0
  69. devrel_origin/project/templates/style.md +24 -0
  70. devrel_origin/project/templates/voice.md +29 -0
  71. devrel_origin/quality/__init__.py +66 -0
  72. devrel_origin/quality/editorial.py +357 -0
  73. devrel_origin/quality/persona.py +84 -0
  74. devrel_origin/quality/readability.py +148 -0
  75. devrel_origin/quality/slop.py +167 -0
  76. devrel_origin/quality/style.py +110 -0
  77. devrel_origin/quality/voice.py +15 -0
  78. devrel_origin/tools/__init__.py +9 -0
  79. devrel_origin/tools/analytics.py +304 -0
  80. devrel_origin/tools/api_client.py +393 -0
  81. devrel_origin/tools/apollo_client.py +305 -0
  82. devrel_origin/tools/code_validator.py +428 -0
  83. devrel_origin/tools/github_tools.py +297 -0
  84. devrel_origin/tools/instantly_client.py +412 -0
  85. devrel_origin/tools/kb_harvester.py +340 -0
  86. devrel_origin/tools/mcp_server.py +578 -0
  87. devrel_origin/tools/notifications.py +245 -0
  88. devrel_origin/tools/run_report.py +193 -0
  89. devrel_origin/tools/scheduler.py +231 -0
  90. devrel_origin/tools/search_tools.py +321 -0
  91. devrel_origin/tools/self_improve.py +168 -0
  92. devrel_origin/tools/sheets.py +236 -0
  93. devrel_origin-0.2.14.dist-info/METADATA +354 -0
  94. devrel_origin-0.2.14.dist-info/RECORD +98 -0
  95. devrel_origin-0.2.14.dist-info/WHEEL +5 -0
  96. devrel_origin-0.2.14.dist-info/entry_points.txt +2 -0
  97. devrel_origin-0.2.14.dist-info/licenses/LICENSE +21 -0
  98. devrel_origin-0.2.14.dist-info/top_level.txt +1 -0
@@ -0,0 +1,340 @@
1
+ """
2
+ KB Harvester — Automatic knowledge base population from public content.
3
+
4
+ Scrapes public content sources (website, Substack, LinkedIn, GitHub README)
5
+ and converts them into markdown files for the knowledge base.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import os
11
+ import re
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ import httpx
17
+
18
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)
19
+
20
+
21
@dataclass
class HarvestSource:
    """Describes one place that knowledge-base content is pulled from."""

    # Human-readable label for the source.
    name: str
    # Address that gets fetched.
    url: str
    # Fetch strategy: "website", "substack", "github", or "sitemap".
    source_type: str
    # KB subdirectory the result is filed under (e.g. "about", "blog", "docs").
    category: str
29
+
30
+
31
@dataclass
class HarvestedDoc:
    """The outcome of harvesting one source: text plus where to file it."""

    # Display title of the document.
    title: str
    # URL the content was fetched from.
    source_url: str
    # Document body (markdown or plain text).
    content: str
    # KB subdirectory this document belongs to.
    category: str
    # Sanitized filename used inside the KB.
    filename: str
40
+
41
+
42
# Environment overrides are read once, at import time.  An unset *or empty*
# variable falls back to the built-in default, so a blank ``PRODUCT_URL=`` /
# ``GITHUB_REPO=`` entry cannot produce a source with an empty or malformed URL.
_product_url = os.getenv("PRODUCT_URL") or "https://openclaw.ai"
_github_repo = os.getenv("GITHUB_REPO") or "openclaw/openclaw"

# Sources harvested when the caller supplies none.  Each entry holds exactly
# the keyword arguments of ``HarvestSource``.
DEFAULT_SOURCES: list[dict[str, str]] = [
    {
        "name": "Website Homepage",
        "url": _product_url,
        "source_type": "website",
        "category": "about",
    },
    {
        "name": "GitHub README",
        # Raw file URL; the README is read from the repository's ``main`` branch.
        "url": f"https://raw.githubusercontent.com/{_github_repo}/main/README.md",
        "source_type": "github",
        "category": "docs",
    },
]
59
+
60
+
61
class KBHarvester:
    """Harvests public content into the knowledge base.

    Usage::

        harvester = KBHarvester(kb_path, firecrawl_api_key="fc-...")
        report = await harvester.harvest_all()
        # Or harvest specific URLs:
        doc = await harvester.harvest_url("https://example.com/blog/post", "blog")
        await harvester.close()

    The instance can also be used as an async context manager, which closes
    the underlying HTTP client on exit::

        async with KBHarvester(kb_path) as harvester:
            report = await harvester.harvest_all()
    """

    FIRECRAWL_API = "https://api.firecrawl.dev/v1"
    # Upper bound on the number of pages fetched from a single sitemap.
    SITEMAP_PAGE_LIMIT = 20
    # Scraped pages with fewer characters than this are treated as empty.
    MIN_CONTENT_CHARS = 50

    def __init__(
        self,
        kb_path: Path,
        firecrawl_api_key: str = "",
        sources: list[dict[str, str]] | None = None,
    ):
        """Create a harvester.

        Args:
            kb_path: Root directory of the knowledge base.
            firecrawl_api_key: Firecrawl key; when empty, pages are fetched
                directly and converted with ``_strip_html``.
            sources: Source definitions, each a dict of ``HarvestSource``
                keyword arguments. ``None`` or an empty list selects
                ``DEFAULT_SOURCES``.
        """
        self.kb_path = Path(kb_path)
        self.firecrawl_api_key = firecrawl_api_key
        self.sources = [HarvestSource(**s) for s in (sources or DEFAULT_SOURCES)]
        self._client = httpx.AsyncClient(timeout=30.0)

    async def __aenter__(self) -> "KBHarvester":
        """Enter an ``async with`` block; returns the harvester itself."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the HTTP client when the ``async with`` block exits."""
        await self.close()

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def harvest_all(self) -> dict[str, Any]:
        """Harvest all configured sources in parallel.

        Returns:
            A report dict with integer ``harvested`` and ``failed`` counters
            and a ``sources`` list holding one entry per configured source
            (``status`` is ``"ok"``, ``"failed"`` or ``"empty"``).
        """
        outcomes = await asyncio.gather(
            *(self._harvest_source(s) for s in self.sources),
            return_exceptions=True,
        )

        report: dict[str, Any] = {"harvested": 0, "failed": 0, "sources": []}
        for source, outcome in zip(self.sources, outcomes, strict=True):
            # BaseException (not Exception) so that a CancelledError returned
            # by gather() is reported as a failure instead of being treated
            # as a document.
            if isinstance(outcome, BaseException):
                report["failed"] += 1
                report["sources"].append(
                    {
                        "name": source.name,
                        "status": "failed",
                        "error": str(outcome),
                    }
                )
            elif outcome:
                report["harvested"] += 1
                report["sources"].append(
                    {
                        "name": source.name,
                        "status": "ok",
                        "file": outcome.filename,
                    }
                )
            else:
                # The harvest ran but produced nothing; counted as a failure.
                report["failed"] += 1
                report["sources"].append(
                    {
                        "name": source.name,
                        "status": "empty",
                    }
                )

        logger.info(f"Harvest complete: {report['harvested']} OK, {report['failed']} failed")
        return report

    async def _harvest_source(self, source: HarvestSource) -> HarvestedDoc | None:
        """Dispatch a source to the handler for its ``source_type``.

        Returns ``None`` (after logging a warning) for unknown types.
        """
        if source.source_type == "github":
            return await self._harvest_raw_url(source)
        if source.source_type in ("website", "substack"):
            return await self._harvest_web_page(source)
        if source.source_type == "sitemap":
            return await self._harvest_sitemap(source)

        logger.warning(f"Unknown source type: {source.source_type}")
        return None

    async def _harvest_raw_url(self, source: HarvestSource) -> HarvestedDoc | None:
        """Fetch raw content (e.g., GitHub raw files) and save it verbatim.

        Any error (network, HTTP status, file write) is logged and turned
        into a ``None`` result.
        """
        try:
            resp = await self._client.get(source.url, follow_redirects=True)
            resp.raise_for_status()

            doc = HarvestedDoc(
                title=source.name,
                source_url=source.url,
                content=resp.text,
                category=source.category,
                filename=self._sanitize_filename(source.name) + ".md",
            )
            self._save_doc(doc)
            return doc
        except Exception as exc:
            logger.warning(f"Failed to harvest {source.url}: {exc}")
            return None

    async def _harvest_web_page(self, source: HarvestSource) -> HarvestedDoc | None:
        """Scrape a web page, preferring Firecrawl for clean markdown.

        Falls back to a direct GET plus ``_strip_html`` when no Firecrawl key
        is configured or Firecrawl yields no markdown. Returns ``None`` when
        the direct fetch fails or the resulting text is shorter than
        ``MIN_CONTENT_CHARS``; otherwise the document is saved and returned.
        """
        content = ""

        # Try Firecrawl first
        if self.firecrawl_api_key:
            try:
                resp = await self._client.post(
                    f"{self.FIRECRAWL_API}/scrape",
                    headers={
                        "Authorization": f"Bearer {self.firecrawl_api_key}",
                        "Content-Type": "application/json",
                    },
                    json={"url": source.url, "formats": ["markdown"]},
                )
                resp.raise_for_status()
                data = resp.json()
                if data.get("success"):
                    content = data.get("data", {}).get("markdown", "")
            except Exception as exc:
                # Best effort: a Firecrawl failure only triggers the fallback.
                logger.warning(f"Firecrawl scrape failed for {source.url}: {exc}")

        # Fallback: direct fetch with HTML stripping
        if not content:
            try:
                resp = await self._client.get(
                    source.url,
                    follow_redirects=True,
                    headers={"User-Agent": "DevRelOrigin/1.0"},
                )
                resp.raise_for_status()
                content = self._strip_html(resp.text)
            except Exception as exc:
                logger.warning(f"Direct fetch failed for {source.url}: {exc}")
                return None

        if not content or len(content) < self.MIN_CONTENT_CHARS:
            return None

        title = self._extract_title(content) or source.name
        doc = HarvestedDoc(
            title=title,
            source_url=source.url,
            content=f"# {title}\n\n> Source: {source.url}\n\n{content}",
            category=source.category,
            filename=self._sanitize_filename(title) + ".md",
        )
        self._save_doc(doc)
        return doc

    async def _harvest_sitemap(self, source: HarvestSource) -> HarvestedDoc | None:
        """Parse a sitemap and harvest linked pages.

        At most ``SITEMAP_PAGE_LIMIT`` URLs are fetched, one after another.
        Each page is saved by ``_harvest_web_page``; the returned summary
        document itself is not written to disk.

        Returns:
            A summary ``HarvestedDoc``, or ``None`` when the sitemap lists no
            URLs, no page could be harvested, or an error occurred.
        """
        try:
            resp = await self._client.get(source.url, follow_redirects=True)
            resp.raise_for_status()

            urls = self._parse_sitemap_urls(resp.text)
            if not urls:
                return None

            pages: list[str] = []
            for url in urls[: self.SITEMAP_PAGE_LIMIT]:
                sub_source = HarvestSource(
                    name=url.split("/")[-1] or "page",
                    url=url,
                    source_type="website",
                    category=source.category,
                )
                doc = await self._harvest_web_page(sub_source)
                if doc:
                    pages.append(doc.filename)

            logger.info(f"Sitemap: harvested {len(pages)}/{len(urls)} pages")
            if not pages:
                # Nothing was saved, so do not report this source as a success.
                return None

            # Return a summary doc
            return HarvestedDoc(
                title=f"Sitemap: {source.name}",
                source_url=source.url,
                content=f"Harvested {len(pages)} pages from sitemap.",
                category=source.category,
                filename="sitemap-index.md",
            )
        except Exception as exc:
            logger.warning(f"Sitemap harvest failed: {exc}")
            return None

    async def harvest_url(self, url: str, category: str = "misc") -> HarvestedDoc | None:
        """Harvest a single URL into the KB.

        The URL is treated as a ``"website"`` source whose fallback name is
        the last path segment of the URL (or ``"page"`` when that is empty).
        """
        source = HarvestSource(
            name=url.split("/")[-1] or "page",
            url=url,
            source_type="website",
            category=category,
        )
        return await self._harvest_web_page(source)

    def _save_doc(self, doc: HarvestedDoc) -> None:
        """Write a document to ``<kb_path>/<category>/<filename>`` as UTF-8.

        The category directory is created when missing; an existing file with
        the same name is overwritten.
        """
        category_dir = self.kb_path / doc.category
        category_dir.mkdir(parents=True, exist_ok=True)

        filepath = category_dir / doc.filename
        filepath.write_text(doc.content, encoding="utf-8")
        logger.info(f"Saved KB doc: {filepath}")

    @staticmethod
    def _parse_sitemap_urls(xml: str) -> list[str]:
        """Return the ``<loc>`` URLs of a sitemap in document order.

        Surrounding whitespace is dropped, CDATA wrappers are removed, and
        entity escapes such as ``&amp;`` are decoded so the URLs can be
        requested as-is.
        """
        from html import unescape

        urls: list[str] = []
        for raw in re.findall(r"<loc>\s*(.*?)\s*</loc>", xml, flags=re.DOTALL):
            if raw.startswith("<![CDATA[") and raw.endswith("]]>"):
                # CDATA content is literal text: unwrap it, do not unescape.
                url = raw[len("<![CDATA[") : -len("]]>")].strip()
            else:
                url = unescape(raw)
            if url:
                urls.append(url)
        return urls

    @staticmethod
    def _sanitize_filename(text: str) -> str:
        """Convert text to a safe filename stem.

        Lower-cases the text, removes everything except word characters,
        whitespace and hyphens, joins words with ``-`` and truncates to 80
        characters. Returns ``"untitled"`` when nothing usable remains, so the
        caller never ends up with a bare ``.md`` file name.
        """
        clean = re.sub(r"[^\w\s-]", "", text.lower())
        clean = re.sub(r"[\s_]+", "-", clean)
        return clean[:80].strip("-") or "untitled"

    @staticmethod
    def _extract_title(content: str) -> str:
        """Extract a title from the first ten lines of markdown/text content.

        A ``# `` heading wins; otherwise the first line longer than ten
        characters that does not start with ``http``, ``<`` or ``!`` is used,
        truncated to 100 characters. Returns ``""`` when neither is found.
        """
        for line in content.split("\n")[:10]:
            line = line.strip()
            if line.startswith("# "):
                return line[2:].strip()
            if len(line) > 10 and not line.startswith(("http", "<", "!")):
                return line[:100]
        return ""

    @staticmethod
    def _strip_html(html: str) -> str:
        """Crude HTML → text conversion.

        Removes ``<script>``/``<style>`` elements (any letter case) and HTML
        comments, replaces the remaining tags with spaces, decodes character
        references, and collapses runs of whitespace.
        """
        # Imported by name so the module does not clash with the ``html`` parameter.
        from html import unescape

        block_flags = re.DOTALL | re.IGNORECASE
        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=block_flags)
        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=block_flags)
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
        text = re.sub(r"<[^>]+>", " ", text)
        # Decode entities only after tags are gone, so escaped markup such as
        # "&lt;b&gt;" survives as literal text.
        text = unescape(text)
        return re.sub(r"\s+", " ", text).strip()
289
+
290
+
291
async def main() -> None:
    """CLI entry point for KB harvesting.

    With ``--url`` a single page is harvested into ``--category``; otherwise
    every configured source is harvested and a per-source summary is printed.
    ``--sources-file`` replaces the default sources with a JSON list of
    source objects. The Firecrawl key is read from ``FIRECRAWL_API_KEY``.
    """
    import argparse
    import json
    import os

    from dotenv import load_dotenv

    # NOTE: DEFAULT_SOURCES is built when this module is imported, so
    # PRODUCT_URL / GITHUB_REPO values that exist only in a .env file are
    # loaded too late to affect the default sources.
    load_dotenv()

    parser = argparse.ArgumentParser(description="Harvest content into knowledge base")
    parser.add_argument("--kb-path", default="knowledge_base", help="KB directory")
    parser.add_argument("--url", help="Single URL to harvest")
    parser.add_argument("--category", default="misc", help="KB category for --url")
    parser.add_argument(
        "--sources-file",
        help="JSON file with custom harvest sources",
    )
    args = parser.parse_args()

    sources = None
    if args.sources_file:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        sources = json.loads(Path(args.sources_file).read_text(encoding="utf-8"))
        if not isinstance(sources, list):
            parser.error("--sources-file must contain a JSON list of source objects")

    harvester = KBHarvester(
        kb_path=Path(args.kb_path),
        firecrawl_api_key=os.environ.get("FIRECRAWL_API_KEY", ""),
        sources=sources,
    )

    try:
        if args.url:
            doc = await harvester.harvest_url(args.url, args.category)
            if doc:
                print(f"Harvested: {doc.filename} ({len(doc.content)} chars)")
            else:
                print("Failed to harvest URL")
        else:
            report = await harvester.harvest_all()
            print(f"Harvested: {report['harvested']}, Failed: {report['failed']}")
            for s in report["sources"]:
                status = "✓" if s["status"] == "ok" else "✗"
                print(f"  [{status}] {s['name']}: {s.get('file', s.get('error', ''))}")
    finally:
        # Always release the HTTP client, even when harvesting raised.
        await harvester.close()


if __name__ == "__main__":
    asyncio.run(main())