bluera-knowledge 0.18.2 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +19 -28
- package/dist/{chunk-NYRKKRRA.js → chunk-27Y4ENUD.js} +2 -2
- package/dist/{chunk-JSCOGKNU.js → chunk-EQYSYRQJ.js} +129 -50
- package/dist/chunk-EQYSYRQJ.js.map +1 -0
- package/dist/{chunk-YMSMKOMF.js → chunk-KQLTWB4T.js} +5 -112
- package/dist/{chunk-YMSMKOMF.js.map → chunk-KQLTWB4T.js.map} +1 -1
- package/dist/index.js +3 -3
- package/dist/mcp/server.d.ts +0 -29
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +2 -2
- package/package.json +3 -1
- package/python/ast_worker.py +209 -0
- package/dist/chunk-JSCOGKNU.js.map +0 -1
- package/python/crawl_worker.py +0 -280
- package/dist/{chunk-NYRKKRRA.js.map → chunk-27Y4ENUD.js.map} +0 -0
package/python/crawl_worker.py
DELETED
@@ -1,280 +0,0 @@
#!/usr/bin/env python3
import sys
import json
import asyncio
import os
import ast
from typing import List, Dict, Any

# Suppress crawl4ai logging before import
os.environ['CRAWL4AI_VERBOSE'] = '0'

# Redirect stderr to suppress logging (crawl4ai uses console for progress)
import io
sys.stderr = io.StringIO()

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def fetch_headless(url: str):
    """Fetch URL with headless browser (Playwright via crawl4ai)"""
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        wait_for="js:() => document.readyState === 'complete'",
        page_timeout=30000
    )

    async with AsyncWebCrawler(config=browser_config, verbose=False) as crawler:
        result = await crawler.arun(url, config=run_config)

        if not result.success:
            raise Exception(f"Crawl failed: {result.error_message}")

        # Combine internal and external links - let TypeScript filter by domain
        all_links = []
        if isinstance(result.links, dict):
            all_links = result.links.get("internal", []) + result.links.get("external", [])

        return {
            "html": result.html or '',
            "markdown": result.markdown or result.cleaned_html or '',
            "links": all_links
        }

def is_exported(node: ast.AST) -> bool:
    """Check if a function or class is exported (Python doesn't have explicit exports, check if starts with '_')"""
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
        return not node.name.startswith('_')
    return False

def get_signature(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
    """Extract function signature from AST node"""
    args_list = []

    for arg in node.args.args:
        arg_str = arg.arg
        if arg.annotation:
            arg_str += f': {ast.unparse(arg.annotation)}'
        args_list.append(arg_str)

    return_annotation = ''
    if node.returns:
        return_annotation = f' -> {ast.unparse(node.returns)}'

    return f"{node.name}({', '.join(args_list)}){return_annotation}"

def extract_imports(tree: ast.AST) -> List[Dict[str, Any]]:
    """Extract import statements from AST"""
    imports = []

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                imports.append({
                    'source': alias.name,
                    'imported': alias.asname if alias.asname else alias.name
                })
        elif isinstance(node, ast.ImportFrom):
            module = node.module if node.module else ''
            for alias in node.names:
                imports.append({
                    'source': module,
                    'imported': alias.name,
                    'alias': alias.asname if alias.asname else None
                })

    return imports

def extract_calls(node: ast.AST) -> List[str]:
    """Extract function calls from a function/method body"""
    calls = []

    for child in ast.walk(node):
        if isinstance(child, ast.Call):
            if isinstance(child.func, ast.Name):
                calls.append(child.func.id)
            elif isinstance(child.func, ast.Attribute):
                calls.append(child.func.attr)

    return calls

async def parse_python_ast(code: str, file_path: str) -> Dict[str, Any]:
    """Parse Python code and return CodeNode array"""
    try:
        tree = ast.parse(code)
        nodes = []

        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                nodes.append({
                    'type': 'function',
                    'name': node.name,
                    'exported': is_exported(node),
                    'startLine': node.lineno,
                    'endLine': node.end_lineno if node.end_lineno else node.lineno,
                    'async': isinstance(node, ast.AsyncFunctionDef),
                    'signature': get_signature(node),
                    'calls': extract_calls(node)
                })

            elif isinstance(node, ast.ClassDef):
                methods = []
                for item in node.body:
                    if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        methods.append({
                            'name': item.name,
                            'async': isinstance(item, ast.AsyncFunctionDef),
                            'signature': get_signature(item),
                            'startLine': item.lineno,
                            'endLine': item.end_lineno if item.end_lineno else item.lineno,
                            'calls': extract_calls(item)
                        })

                nodes.append({
                    'type': 'class',
                    'name': node.name,
                    'exported': is_exported(node),
                    'startLine': node.lineno,
                    'endLine': node.end_lineno if node.end_lineno else node.lineno,
                    'methods': methods
                })

        imports = extract_imports(tree)

        return {
            'nodes': nodes,
            'imports': imports
        }

    except SyntaxError as e:
        raise Exception(f"Python syntax error at line {e.lineno}: {e.msg}")
    except Exception as e:
        raise Exception(f"Failed to parse Python AST: {str(e)}")

async def process_request(crawler, request):
    """Process a single crawl request"""
    try:
        params = request.get('params', {})
        url = params.get('url')

        if not url:
            raise ValueError('URL parameter is required')

        # Perform async crawl
        result = await crawler.arun(url=url)

        if not result.success:
            raise Exception(f"Crawl failed: {result.error_message}")

        # Extract title from metadata (crawl4ai 0.7.8 stores title in metadata dict)
        title = ''
        if result.metadata and isinstance(result.metadata, dict):
            title = result.metadata.get('title', '')

        # Get markdown content (crawl4ai 0.7.8)
        markdown = result.markdown or result.cleaned_html or ''

        # Extract links - crawl4ai 0.7.8 returns dict with 'internal' and 'external' keys
        # Each link is an object with href, text, title, etc. - extract just href strings
        all_links = []
        if isinstance(result.links, dict):
            internal = result.links.get('internal', [])
            external = result.links.get('external', [])
            # Extract href from link objects (crawl4ai 0.7.8 returns objects, not strings)
            for link in internal + external:
                if isinstance(link, dict):
                    all_links.append(link.get('href', ''))
                elif isinstance(link, str):
                    all_links.append(link)

        response = {
            'jsonrpc': '2.0',
            'id': request.get('id'),
            'result': {
                'pages': [{
                    'url': url,
                    'title': title,
                    'content': markdown,
                    'html': result.html or '',
                    'links': all_links,
                    'crawledAt': '',  # crawl4ai 0.7.8 doesn't provide timestamp
                }]
            }
        }
        print(json.dumps(response), flush=True)
    except Exception as e:
        error_response = {
            'jsonrpc': '2.0',
            'id': request.get('id') if isinstance(request, dict) else None,
            'error': {'code': -1, 'message': str(e)}
        }
        print(json.dumps(error_response), flush=True)

async def main():
    """Main async loop processing stdin requests"""
    # Disable verbose logging in crawl4ai
    async with AsyncWebCrawler(verbose=False) as crawler:
        for line in sys.stdin:
            try:
                request = json.loads(line.strip())
                method = request.get('method')

                if method == 'crawl':
                    await process_request(crawler, request)
                elif method == 'fetch_headless':
                    # Handle headless fetch request
                    try:
                        params = request.get('params', {})
                        url = params.get('url')
                        if not url:
                            raise ValueError('URL parameter is required')

                        result = await fetch_headless(url)
                        response = {
                            'jsonrpc': '2.0',
                            'id': request.get('id'),
                            'result': result
                        }
                        print(json.dumps(response), flush=True)
                    except Exception as e:
                        error_response = {
                            'jsonrpc': '2.0',
                            'id': request.get('id'),
                            'error': {'code': -1, 'message': str(e)}
                        }
                        print(json.dumps(error_response), flush=True)

                elif method == 'parse_python':
                    # Handle Python AST parsing request
                    try:
                        params = request.get('params', {})
                        code = params.get('code')
                        file_path = params.get('filePath', '<unknown>')

                        if not code:
                            raise ValueError('code parameter is required')

                        result = await parse_python_ast(code, file_path)
                        response = {
                            'jsonrpc': '2.0',
                            'id': request.get('id'),
                            'result': result
                        }
                        print(json.dumps(response), flush=True)
                    except Exception as e:
                        error_response = {
                            'jsonrpc': '2.0',
                            'id': request.get('id'),
                            'error': {'code': -1, 'message': str(e)}
                        }
                        print(json.dumps(error_response), flush=True)

            except Exception as e:
                error_response = {
                    'jsonrpc': '2.0',
                    'id': request.get('id') if isinstance(request, dict) else None,
                    'error': {'code': -1, 'message': str(e)}
                }
                print(json.dumps(error_response), flush=True)

if __name__ == '__main__':
    asyncio.run(main())
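The removed worker spoke line-delimited JSON-RPC over stdin/stdout, one request per line with a `crawl`, `fetch_headless`, or `parse_python` method. The snippet below is a minimal sketch of that exchange against the 0.18.2 worker; the `python3` invocation and the relative worker path are assumptions about the unpacked package layout, and running it requires the `crawl4ai` dependency the worker imports.

```python
import json
import subprocess

# Hypothetical driver: spawn the pre-0.19.0 worker and exchange one JSON line each way.
proc = subprocess.Popen(
    ["python3", "python/crawl_worker.py"],  # assumed path inside the unpacked 0.18.2 package
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)

# Ask the worker to parse a small Python snippet via the 'parse_python' method.
request = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "parse_python",
    "params": {
        "code": "def hello(name: str) -> str:\n    return name\n",
        "filePath": "example.py",
    },
}
proc.stdin.write(json.dumps(request) + "\n")
proc.stdin.flush()

# The worker writes one JSON-RPC response per line on stdout.
response = json.loads(proc.stdout.readline())
print(response["result"]["nodes"][0]["signature"])  # expected: hello(name: str) -> str

proc.stdin.close()
proc.wait()
```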