devrel-origin 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devrel_origin/__init__.py +15 -0
- devrel_origin/cli/__init__.py +92 -0
- devrel_origin/cli/_common.py +243 -0
- devrel_origin/cli/analytics.py +28 -0
- devrel_origin/cli/argus.py +497 -0
- devrel_origin/cli/auth.py +227 -0
- devrel_origin/cli/config.py +108 -0
- devrel_origin/cli/content.py +259 -0
- devrel_origin/cli/cost.py +108 -0
- devrel_origin/cli/cro.py +298 -0
- devrel_origin/cli/deliverables.py +65 -0
- devrel_origin/cli/docs.py +91 -0
- devrel_origin/cli/doctor.py +178 -0
- devrel_origin/cli/experiment.py +29 -0
- devrel_origin/cli/growth.py +97 -0
- devrel_origin/cli/init.py +472 -0
- devrel_origin/cli/intel.py +27 -0
- devrel_origin/cli/kb.py +96 -0
- devrel_origin/cli/listen.py +31 -0
- devrel_origin/cli/marketing.py +66 -0
- devrel_origin/cli/migrate.py +45 -0
- devrel_origin/cli/run.py +46 -0
- devrel_origin/cli/sales.py +57 -0
- devrel_origin/cli/schedule.py +62 -0
- devrel_origin/cli/synthesize.py +28 -0
- devrel_origin/cli/triage.py +29 -0
- devrel_origin/cli/video.py +35 -0
- devrel_origin/core/__init__.py +58 -0
- devrel_origin/core/agent_config.py +75 -0
- devrel_origin/core/argus.py +964 -0
- devrel_origin/core/atlas.py +1450 -0
- devrel_origin/core/base.py +372 -0
- devrel_origin/core/cyra.py +563 -0
- devrel_origin/core/dex.py +708 -0
- devrel_origin/core/echo.py +614 -0
- devrel_origin/core/growth/__init__.py +27 -0
- devrel_origin/core/growth/recommendations.py +219 -0
- devrel_origin/core/growth/target_kinds.py +51 -0
- devrel_origin/core/iris.py +513 -0
- devrel_origin/core/kai.py +1367 -0
- devrel_origin/core/llm.py +542 -0
- devrel_origin/core/llm_backends.py +274 -0
- devrel_origin/core/mox.py +514 -0
- devrel_origin/core/nova.py +349 -0
- devrel_origin/core/pax.py +1205 -0
- devrel_origin/core/rex.py +532 -0
- devrel_origin/core/sage.py +486 -0
- devrel_origin/core/sentinel.py +385 -0
- devrel_origin/core/types.py +98 -0
- devrel_origin/core/video/__init__.py +22 -0
- devrel_origin/core/video/assembler.py +131 -0
- devrel_origin/core/video/browser_recorder.py +118 -0
- devrel_origin/core/video/desktop_recorder.py +254 -0
- devrel_origin/core/video/overlay_renderer.py +143 -0
- devrel_origin/core/video/script_parser.py +147 -0
- devrel_origin/core/video/tts_engine.py +82 -0
- devrel_origin/core/vox.py +268 -0
- devrel_origin/core/watchdog.py +321 -0
- devrel_origin/project/__init__.py +1 -0
- devrel_origin/project/config.py +75 -0
- devrel_origin/project/cost_sink.py +61 -0
- devrel_origin/project/init.py +104 -0
- devrel_origin/project/paths.py +75 -0
- devrel_origin/project/state.py +241 -0
- devrel_origin/project/templates/__init__.py +4 -0
- devrel_origin/project/templates/config.toml +24 -0
- devrel_origin/project/templates/devrel.gitignore +10 -0
- devrel_origin/project/templates/slop-blocklist.md +45 -0
- devrel_origin/project/templates/style.md +24 -0
- devrel_origin/project/templates/voice.md +29 -0
- devrel_origin/quality/__init__.py +66 -0
- devrel_origin/quality/editorial.py +357 -0
- devrel_origin/quality/persona.py +84 -0
- devrel_origin/quality/readability.py +148 -0
- devrel_origin/quality/slop.py +167 -0
- devrel_origin/quality/style.py +110 -0
- devrel_origin/quality/voice.py +15 -0
- devrel_origin/tools/__init__.py +9 -0
- devrel_origin/tools/analytics.py +304 -0
- devrel_origin/tools/api_client.py +393 -0
- devrel_origin/tools/apollo_client.py +305 -0
- devrel_origin/tools/code_validator.py +428 -0
- devrel_origin/tools/github_tools.py +297 -0
- devrel_origin/tools/instantly_client.py +412 -0
- devrel_origin/tools/kb_harvester.py +340 -0
- devrel_origin/tools/mcp_server.py +578 -0
- devrel_origin/tools/notifications.py +245 -0
- devrel_origin/tools/run_report.py +193 -0
- devrel_origin/tools/scheduler.py +231 -0
- devrel_origin/tools/search_tools.py +321 -0
- devrel_origin/tools/self_improve.py +168 -0
- devrel_origin/tools/sheets.py +236 -0
- devrel_origin-0.2.14.dist-info/METADATA +354 -0
- devrel_origin-0.2.14.dist-info/RECORD +98 -0
- devrel_origin-0.2.14.dist-info/WHEEL +5 -0
- devrel_origin-0.2.14.dist-info/entry_points.txt +2 -0
- devrel_origin-0.2.14.dist-info/licenses/LICENSE +21 -0
- devrel_origin-0.2.14.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""
|
|
2
|
+
KB Harvester — Automatic knowledge base population from public content.
|
|
3
|
+
|
|
4
|
+
Scrapes public content sources (website, Substack, LinkedIn, GitHub README)
|
|
5
|
+
and converts them into markdown files for the knowledge base.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class HarvestSource:
    """Describes one place that content is pulled from.

    Attributes:
        name: Label for the source; used in reports and as a fallback title.
        url: Address that is fetched when the source is harvested.
        source_type: One of "website", "substack", "github", "sitemap".
        category: KB subdirectory (e.g., "about", "blog", "docs").
    """

    name: str
    url: str
    source_type: str
    category: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class HarvestedDoc:
    """One document produced by a harvest.

    Attributes:
        title: Title of the document.
        source_url: URL the content was fetched from.
        content: Document body as written to the knowledge base.
        category: KB subdirectory the document belongs to.
        filename: Sanitized filename for KB.
    """

    title: str
    source_url: str
    content: str
    category: str
    filename: str
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Both values can be overridden through the environment; the literals are the
# fallbacks used when the variables are unset.
_product_url = os.environ.get("PRODUCT_URL", "https://openclaw.ai")
_github_repo = os.environ.get("GITHUB_REPO", "openclaw/openclaw")

# Sources harvested when the caller does not supply its own list. Each entry
# carries exactly the keyword arguments accepted by ``HarvestSource``.
DEFAULT_SOURCES: list[dict[str, str]] = [
    # Product landing page, scraped as a regular web page.
    {
        "name": "Website Homepage",
        "url": _product_url,
        "source_type": "website",
        "category": "about",
    },
    # README fetched as raw text from the repository's ``main`` branch.
    {
        "name": "GitHub README",
        "url": f"https://raw.githubusercontent.com/{_github_repo}/main/README.md",
        "source_type": "github",
        "category": "docs",
    },
]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class KBHarvester:
    """Harvests public content into the knowledge base.

    Usage::

        harvester = KBHarvester(kb_path, firecrawl_api_key="fc-...")
        report = await harvester.harvest_all()
        # Or harvest specific URLs:
        doc = await harvester.harvest_url("https://example.com/blog/post", "blog")

    The harvester owns an ``httpx.AsyncClient``. Call :meth:`close` when done,
    or use the instance as an async context manager::

        async with KBHarvester(kb_path) as harvester:
            await harvester.harvest_all()
    """

    FIRECRAWL_API = "https://api.firecrawl.dev/v1"

    # Maximum number of sitemap entries fetched per sitemap source.
    SITEMAP_PAGE_LIMIT = 20

    # Pages whose extracted text is shorter than this are treated as empty.
    MIN_CONTENT_CHARS = 50

    def __init__(
        self,
        kb_path: Path,
        firecrawl_api_key: str = "",
        sources: list[dict[str, str]] | None = None,
    ):
        """Create a harvester.

        Args:
            kb_path: Root directory of the knowledge base.
            firecrawl_api_key: Firecrawl API key; when empty, pages are
                fetched directly and converted with :meth:`_strip_html`.
            sources: Source descriptions, each holding the keyword arguments
                of ``HarvestSource``. ``None`` or an empty list falls back to
                ``DEFAULT_SOURCES``.
        """
        self.kb_path = Path(kb_path)
        self.firecrawl_api_key = firecrawl_api_key
        self.sources = [HarvestSource(**s) for s in (sources or DEFAULT_SOURCES)]
        self._client = httpx.AsyncClient(timeout=30.0)

    async def __aenter__(self) -> "KBHarvester":
        """Return the harvester itself for use in ``async with``."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the HTTP client when the ``async with`` block exits."""
        await self.close()

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def harvest_all(self) -> dict[str, Any]:
        """Harvest all configured sources in parallel.

        Returns:
            A report dict with ``"harvested"`` and ``"failed"`` counters and a
            ``"sources"`` list holding one entry per source. Each entry has a
            ``"name"`` and a ``"status"`` of ``"ok"`` (plus ``"file"``),
            ``"failed"`` (plus ``"error"``) or ``"empty"``.
        """
        outcomes = await asyncio.gather(
            *(self._harvest_source(src) for src in self.sources),
            return_exceptions=True,
        )

        report: dict[str, Any] = {"harvested": 0, "failed": 0, "sources": []}
        for source, outcome in zip(self.sources, outcomes, strict=True):
            # gather(return_exceptions=True) can also hand back BaseException
            # subclasses such as CancelledError, so test the base class.
            if isinstance(outcome, BaseException):
                report["failed"] += 1
                report["sources"].append(
                    {
                        "name": source.name,
                        "status": "failed",
                        "error": str(outcome),
                    }
                )
            elif outcome:
                report["harvested"] += 1
                report["sources"].append(
                    {
                        "name": source.name,
                        "status": "ok",
                        "file": outcome.filename,
                    }
                )
            else:
                # The source was reachable in principle but produced no doc.
                report["failed"] += 1
                report["sources"].append(
                    {
                        "name": source.name,
                        "status": "empty",
                    }
                )

        logger.info(
            "Harvest complete: %s OK, %s failed",
            report["harvested"],
            report["failed"],
        )
        return report

    async def _harvest_source(self, source: HarvestSource) -> HarvestedDoc | None:
        """Harvest a single source, dispatching on its ``source_type``."""
        if source.source_type == "github":
            return await self._harvest_raw_url(source)
        if source.source_type in ("website", "substack"):
            return await self._harvest_web_page(source)
        if source.source_type == "sitemap":
            return await self._harvest_sitemap(source)

        logger.warning("Unknown source type: %s", source.source_type)
        return None

    async def _harvest_raw_url(self, source: HarvestSource) -> HarvestedDoc | None:
        """Fetch raw content (e.g., GitHub raw files) and save it unchanged.

        Returns:
            The saved document, or ``None`` if fetching or saving failed.
        """
        try:
            resp = await self._client.get(source.url, follow_redirects=True)
            resp.raise_for_status()

            doc = HarvestedDoc(
                title=source.name,
                source_url=source.url,
                content=resp.text,
                category=source.category,
                filename=self._sanitize_filename(source.name) + ".md",
            )
            self._save_doc(doc)
            return doc
        except Exception as exc:
            # Best-effort: a broken source must not abort the whole harvest.
            logger.warning("Failed to harvest %s: %s", source.url, exc)
            return None

    async def _harvest_web_page(self, source: HarvestSource) -> HarvestedDoc | None:
        """Scrape a web page, preferring Firecrawl for clean markdown.

        Falls back to a direct GET plus :meth:`_strip_html` when no Firecrawl
        key is configured or Firecrawl yields no content.

        Returns:
            The saved document, or ``None`` if the page could not be fetched
            or its content is shorter than ``MIN_CONTENT_CHARS``.
        """
        content = ""

        # Try Firecrawl first
        if self.firecrawl_api_key:
            try:
                resp = await self._client.post(
                    f"{self.FIRECRAWL_API}/scrape",
                    headers={
                        "Authorization": f"Bearer {self.firecrawl_api_key}",
                        "Content-Type": "application/json",
                    },
                    json={"url": source.url, "formats": ["markdown"]},
                )
                resp.raise_for_status()
                payload = resp.json()
                if payload.get("success"):
                    # Tolerate a null "data" or "markdown" value in the payload.
                    content = (payload.get("data") or {}).get("markdown") or ""
            except Exception as exc:
                logger.warning("Firecrawl scrape failed for %s: %s", source.url, exc)

        # Fallback: direct fetch with HTML stripping
        if not content:
            try:
                resp = await self._client.get(
                    source.url,
                    follow_redirects=True,
                    headers={"User-Agent": "DevRelOrigin/1.0"},
                )
                resp.raise_for_status()
                content = self._strip_html(resp.text)
            except Exception as exc:
                logger.warning("Direct fetch failed for %s: %s", source.url, exc)
                return None

        if not content or len(content) < self.MIN_CONTENT_CHARS:
            return None

        title = self._extract_title(content) or source.name
        doc = HarvestedDoc(
            title=title,
            source_url=source.url,
            content=f"# {title}\n\n> Source: {source.url}\n\n{content}",
            category=source.category,
            filename=self._sanitize_filename(title) + ".md",
        )
        self._save_doc(doc)
        return doc

    async def _harvest_sitemap(self, source: HarvestSource) -> HarvestedDoc | None:
        """Parse a sitemap and harvest linked pages.

        Only the first ``SITEMAP_PAGE_LIMIT`` URLs are fetched, one after
        another. Each page is saved through :meth:`_harvest_web_page`.

        Returns:
            A summary document describing how many pages were harvested (the
            summary itself is not written to disk), or ``None`` if the sitemap
            could not be fetched, listed no URLs, or no page was harvested.
        """
        # Local import: the module does not import ``html`` at file level.
        from html import unescape

        try:
            resp = await self._client.get(source.url, follow_redirects=True)
            resp.raise_for_status()

            # Extract URLs from sitemap XML. DOTALL lets a <loc> span lines;
            # values are entity-escaped XML text, so strip and unescape them.
            raw_locs = re.findall(r"<loc>(.*?)</loc>", resp.text, flags=re.DOTALL)
            urls = [unescape(loc.strip()) for loc in raw_locs if loc.strip()]
            if not urls:
                return None

            pages: list[str] = []
            for url in urls[: self.SITEMAP_PAGE_LIMIT]:
                sub_source = HarvestSource(
                    name=url.split("/")[-1] or "page",
                    url=url,
                    source_type="website",
                    category=source.category,
                )
                doc = await self._harvest_web_page(sub_source)
                if doc:
                    pages.append(doc.filename)

            logger.info("Sitemap: harvested %s/%s pages", len(pages), len(urls))
            if not pages:
                # Nothing was saved; do not report the sitemap as a success.
                return None

            # Return a summary doc
            return HarvestedDoc(
                title=f"Sitemap: {source.name}",
                source_url=source.url,
                content=f"Harvested {len(pages)} pages from sitemap.",
                category=source.category,
                filename="sitemap-index.md",
            )
        except Exception as exc:
            logger.warning("Sitemap harvest failed: %s", exc)
            return None

    async def harvest_url(self, url: str, category: str = "misc") -> HarvestedDoc | None:
        """Harvest a single URL into the KB.

        Args:
            url: Page to scrape.
            category: KB subdirectory the resulting document is stored under.

        Returns:
            The saved document, or ``None`` if harvesting failed.
        """
        source = HarvestSource(
            name=url.split("/")[-1] or "page",
            url=url,
            source_type="website",
            category=category,
        )
        return await self._harvest_web_page(source)

    def _save_doc(self, doc: HarvestedDoc) -> None:
        """Save a harvested document to the knowledge base.

        The file is written to ``kb_path / doc.category / doc.filename`` as
        UTF-8; the category directory is created when missing and an existing
        file of the same name is overwritten.
        """
        category_dir = self.kb_path / doc.category
        category_dir.mkdir(parents=True, exist_ok=True)

        filepath = category_dir / doc.filename
        filepath.write_text(doc.content, encoding="utf-8")
        logger.info("Saved KB doc: %s", filepath)

    @staticmethod
    def _sanitize_filename(text: str) -> str:
        """Convert text to a safe filename stem.

        Lower-cases the text, drops everything except word characters,
        whitespace and hyphens, joins runs of whitespace/underscores with a
        single hyphen and truncates to 80 characters.

        Returns:
            The sanitized stem, or ``"untitled"`` when nothing usable remains
            (so callers never end up with a bare ``".md"`` filename).
        """
        clean = re.sub(r"[^\w\s-]", "", text.lower())
        clean = re.sub(r"[\s_]+", "-", clean)
        return clean[:80].strip("-") or "untitled"

    @staticmethod
    def _extract_title(content: str) -> str:
        """Extract title from markdown/text content.

        Scans the first 10 lines and returns the text of the first ``# ``
        heading, or the first line longer than 10 characters that does not
        start with ``http``, ``<`` or ``!`` (capped at 100 characters),
        whichever comes first.

        Returns:
            The title, or an empty string if none was found.
        """
        for raw_line in content.split("\n")[:10]:
            line = raw_line.strip()
            if line.startswith("# "):
                return line[2:].strip()
            if len(line) > 10 and not line.startswith(("http", "<", "!")):
                return line[:100]
        return ""

    @staticmethod
    def _strip_html(html: str) -> str:
        """Crude HTML → text conversion.

        Removes ``<script>``/``<style>`` elements (case-insensitively),
        replaces remaining tags with spaces, decodes HTML entities and
        collapses runs of whitespace.
        """
        # ``html`` is the parameter here, so import the function by name.
        from html import unescape

        no_markup = re.sub(
            r"<script\b[^>]*>.*?</script\s*>",
            "",
            html,
            flags=re.DOTALL | re.IGNORECASE,
        )
        no_markup = re.sub(
            r"<style\b[^>]*>.*?</style\s*>",
            "",
            no_markup,
            flags=re.DOTALL | re.IGNORECASE,
        )
        no_markup = re.sub(r"<[^>]+>", " ", no_markup)
        # Decode entities only after tags are gone so that escaped markup such
        # as "&lt;b&gt;" survives as literal text instead of being stripped.
        text = unescape(no_markup)
        return re.sub(r"\s+", " ", text).strip()
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
async def main() -> None:
    """CLI entry point for KB harvesting.

    Loads environment variables from a ``.env`` file, parses the command line
    and either harvests the single ``--url`` or all configured sources. The
    Firecrawl key is read from ``FIRECRAWL_API_KEY``. Results are printed to
    stdout and the harvester's HTTP client is always closed on exit.
    """
    import argparse
    import json
    import os

    from dotenv import load_dotenv

    load_dotenv()

    parser = argparse.ArgumentParser(description="Harvest content into knowledge base")
    parser.add_argument("--kb-path", default="knowledge_base", help="KB directory")
    parser.add_argument("--url", help="Single URL to harvest")
    parser.add_argument("--category", default="misc", help="KB category for --url")
    parser.add_argument(
        "--sources-file",
        help="JSON file with custom harvest sources",
    )
    args = parser.parse_args()

    sources = None
    if args.sources_file:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        sources = json.loads(Path(args.sources_file).read_text(encoding="utf-8"))

    harvester = KBHarvester(
        kb_path=Path(args.kb_path),
        firecrawl_api_key=os.environ.get("FIRECRAWL_API_KEY", ""),
        sources=sources,
    )

    try:
        if args.url:
            doc = await harvester.harvest_url(args.url, args.category)
            if doc:
                print(f"Harvested: {doc.filename} ({len(doc.content)} chars)")
            else:
                print("Failed to harvest URL")
        else:
            report = await harvester.harvest_all()
            print(f"Harvested: {report['harvested']}, Failed: {report['failed']}")
            for s in report["sources"]:
                status = "✓" if s["status"] == "ok" else "✗"
                # "empty" entries carry neither a file nor an error; show the
                # status itself so the line is never blank after the colon.
                detail = s.get("file") or s.get("error") or s["status"]
                print(f" [{status}] {s['name']}: {detail}")
    finally:
        await harvester.close()
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# Run the harvesting CLI when this module is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|