bluera-knowledge 0.18.2 → 0.19.0

This diff shows the changes between publicly released versions of this package, as they appear in the supported public registries. It is provided for informational purposes only.
@@ -1,280 +0,0 @@
- #!/usr/bin/env python3
- import sys
- import json
- import asyncio
- import os
- import ast
- from typing import List, Dict, Any
-
- # Suppress crawl4ai logging before import
- os.environ['CRAWL4AI_VERBOSE'] = '0'
-
- # Redirect stderr to suppress logging (crawl4ai uses console for progress)
- import io
- sys.stderr = io.StringIO()
-
- from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-
- async def fetch_headless(url: str):
-     """Fetch URL with headless browser (Playwright via crawl4ai)"""
-     browser_config = BrowserConfig(headless=True, verbose=False)
-     run_config = CrawlerRunConfig(
-         wait_for="js:() => document.readyState === 'complete'",
-         page_timeout=30000
-     )
-
-     async with AsyncWebCrawler(config=browser_config, verbose=False) as crawler:
-         result = await crawler.arun(url, config=run_config)
-
-         if not result.success:
-             raise Exception(f"Crawl failed: {result.error_message}")
-
-         # Combine internal and external links - let TypeScript filter by domain
-         all_links = []
-         if isinstance(result.links, dict):
-             all_links = result.links.get("internal", []) + result.links.get("external", [])
-
-         return {
-             "html": result.html or '',
-             "markdown": result.markdown or result.cleaned_html or '',
-             "links": all_links
-         }
-
- def is_exported(node: ast.AST) -> bool:
-     """Check if a function or class is exported (Python doesn't have explicit exports, check if starts with '_')"""
-     if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
-         return not node.name.startswith('_')
-     return False
-
- def get_signature(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
-     """Extract function signature from AST node"""
-     args_list = []
-
-     for arg in node.args.args:
-         arg_str = arg.arg
-         if arg.annotation:
-             arg_str += f': {ast.unparse(arg.annotation)}'
-         args_list.append(arg_str)
-
-     return_annotation = ''
-     if node.returns:
-         return_annotation = f' -> {ast.unparse(node.returns)}'
-
-     return f"{node.name}({', '.join(args_list)}){return_annotation}"
-
- def extract_imports(tree: ast.AST) -> List[Dict[str, Any]]:
-     """Extract import statements from AST"""
-     imports = []
-
-     for node in ast.walk(tree):
-         if isinstance(node, ast.Import):
-             for alias in node.names:
-                 imports.append({
-                     'source': alias.name,
-                     'imported': alias.asname if alias.asname else alias.name
-                 })
-         elif isinstance(node, ast.ImportFrom):
-             module = node.module if node.module else ''
-             for alias in node.names:
-                 imports.append({
-                     'source': module,
-                     'imported': alias.name,
-                     'alias': alias.asname if alias.asname else None
-                 })
-
-     return imports
-
- def extract_calls(node: ast.AST) -> List[str]:
-     """Extract function calls from a function/method body"""
-     calls = []
-
-     for child in ast.walk(node):
-         if isinstance(child, ast.Call):
-             if isinstance(child.func, ast.Name):
-                 calls.append(child.func.id)
-             elif isinstance(child.func, ast.Attribute):
-                 calls.append(child.func.attr)
-
-     return calls
-
- async def parse_python_ast(code: str, file_path: str) -> Dict[str, Any]:
-     """Parse Python code and return CodeNode array"""
-     try:
-         tree = ast.parse(code)
-         nodes = []
-
-         for node in tree.body:
-             if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                 nodes.append({
-                     'type': 'function',
-                     'name': node.name,
-                     'exported': is_exported(node),
-                     'startLine': node.lineno,
-                     'endLine': node.end_lineno if node.end_lineno else node.lineno,
-                     'async': isinstance(node, ast.AsyncFunctionDef),
-                     'signature': get_signature(node),
-                     'calls': extract_calls(node)
-                 })
-
-             elif isinstance(node, ast.ClassDef):
-                 methods = []
-                 for item in node.body:
-                     if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                         methods.append({
-                             'name': item.name,
-                             'async': isinstance(item, ast.AsyncFunctionDef),
-                             'signature': get_signature(item),
-                             'startLine': item.lineno,
-                             'endLine': item.end_lineno if item.end_lineno else item.lineno,
-                             'calls': extract_calls(item)
-                         })
-
-                 nodes.append({
-                     'type': 'class',
-                     'name': node.name,
-                     'exported': is_exported(node),
-                     'startLine': node.lineno,
-                     'endLine': node.end_lineno if node.end_lineno else node.lineno,
-                     'methods': methods
-                 })
-
-         imports = extract_imports(tree)
-
-         return {
-             'nodes': nodes,
-             'imports': imports
-         }
-
-     except SyntaxError as e:
-         raise Exception(f"Python syntax error at line {e.lineno}: {e.msg}")
-     except Exception as e:
-         raise Exception(f"Failed to parse Python AST: {str(e)}")
-
- async def process_request(crawler, request):
-     """Process a single crawl request"""
-     try:
-         params = request.get('params', {})
-         url = params.get('url')
-
-         if not url:
-             raise ValueError('URL parameter is required')
-
-         # Perform async crawl
-         result = await crawler.arun(url=url)
-
-         if not result.success:
-             raise Exception(f"Crawl failed: {result.error_message}")
-
-         # Extract title from metadata (crawl4ai 0.7.8 stores title in metadata dict)
-         title = ''
-         if result.metadata and isinstance(result.metadata, dict):
-             title = result.metadata.get('title', '')
-
-         # Get markdown content (crawl4ai 0.7.8)
-         markdown = result.markdown or result.cleaned_html or ''
-
-         # Extract links - crawl4ai 0.7.8 returns dict with 'internal' and 'external' keys
-         # Each link is an object with href, text, title, etc. - extract just href strings
-         all_links = []
-         if isinstance(result.links, dict):
-             internal = result.links.get('internal', [])
-             external = result.links.get('external', [])
-             # Extract href from link objects (crawl4ai 0.7.8 returns objects, not strings)
-             for link in internal + external:
-                 if isinstance(link, dict):
-                     all_links.append(link.get('href', ''))
-                 elif isinstance(link, str):
-                     all_links.append(link)
-
-         response = {
-             'jsonrpc': '2.0',
-             'id': request.get('id'),
-             'result': {
-                 'pages': [{
-                     'url': url,
-                     'title': title,
-                     'content': markdown,
-                     'html': result.html or '',
-                     'links': all_links,
-                     'crawledAt': '',  # crawl4ai 0.7.8 doesn't provide timestamp
-                 }]
-             }
-         }
-         print(json.dumps(response), flush=True)
-     except Exception as e:
-         error_response = {
-             'jsonrpc': '2.0',
-             'id': request.get('id') if isinstance(request, dict) else None,
-             'error': {'code': -1, 'message': str(e)}
-         }
-         print(json.dumps(error_response), flush=True)
-
- async def main():
-     """Main async loop processing stdin requests"""
-     # Disable verbose logging in crawl4ai
-     async with AsyncWebCrawler(verbose=False) as crawler:
-         for line in sys.stdin:
-             try:
-                 request = json.loads(line.strip())
-                 method = request.get('method')
-
-                 if method == 'crawl':
-                     await process_request(crawler, request)
-                 elif method == 'fetch_headless':
-                     # Handle headless fetch request
-                     try:
-                         params = request.get('params', {})
-                         url = params.get('url')
-                         if not url:
-                             raise ValueError('URL parameter is required')
-
-                         result = await fetch_headless(url)
-                         response = {
-                             'jsonrpc': '2.0',
-                             'id': request.get('id'),
-                             'result': result
-                         }
-                         print(json.dumps(response), flush=True)
-                     except Exception as e:
-                         error_response = {
-                             'jsonrpc': '2.0',
-                             'id': request.get('id'),
-                             'error': {'code': -1, 'message': str(e)}
-                         }
-                         print(json.dumps(error_response), flush=True)
-
-                 elif method == 'parse_python':
-                     # Handle Python AST parsing request
-                     try:
-                         params = request.get('params', {})
-                         code = params.get('code')
-                         file_path = params.get('filePath', '<unknown>')
-
-                         if not code:
-                             raise ValueError('code parameter is required')
-
-                         result = await parse_python_ast(code, file_path)
-                         response = {
-                             'jsonrpc': '2.0',
-                             'id': request.get('id'),
-                             'result': result
-                         }
-                         print(json.dumps(response), flush=True)
-                     except Exception as e:
-                         error_response = {
-                             'jsonrpc': '2.0',
-                             'id': request.get('id'),
-                             'error': {'code': -1, 'message': str(e)}
-                         }
-                         print(json.dumps(error_response), flush=True)
-
-             except Exception as e:
-                 error_response = {
-                     'jsonrpc': '2.0',
-                     'id': request.get('id') if isinstance(request, dict) else None,
-                     'error': {'code': -1, 'message': str(e)}
-                 }
-                 print(json.dumps(error_response), flush=True)
-
- if __name__ == '__main__':
-     asyncio.run(main())
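
The removed file is a stdio side-car: main() reads one JSON-RPC 2.0 request per stdin line (method 'crawl', 'fetch_headless', or 'parse_python') and prints exactly one JSON response line to stdout, so a host process can drive it over pipes. A minimal caller sketch, assuming crawl4ai is installed and the script is saved as bridge.py (the diff does not show the file's real name):

import json
import subprocess

# Spawn the bridge; 'bridge.py' is a placeholder name, not taken from the diff.
proc = subprocess.Popen(
    ['python3', 'bridge.py'],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)

# One JSON-RPC request per line, matching what main() parses.
request = {
    'jsonrpc': '2.0',
    'id': 1,
    'method': 'parse_python',
    'params': {'code': 'def greet(name: str) -> str:\n    return name'},
}
proc.stdin.write(json.dumps(request) + '\n')
proc.stdin.flush()

# main() answers every request with exactly one JSON line.
response = json.loads(proc.stdout.readline())
print(response['result']['nodes'][0]['signature'])  # greet(name: str) -> str

proc.stdin.close()
proc.wait()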
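
A fetch_headless call over the same pipe looks identical on the wire; per fetch_headless above, a successful result carries 'html', 'markdown', and 'links' keys. Continuing the sketch (this one needs Playwright browsers installed and network access):

request = {
    'jsonrpc': '2.0',
    'id': 2,
    'method': 'fetch_headless',
    'params': {'url': 'https://example.com'},
}
proc.stdin.write(json.dumps(request) + '\n')
proc.stdin.flush()

response = json.loads(proc.stdout.readline())
# On success: sorted(response['result'].keys()) == ['html', 'links', 'markdown'];
# on failure the line carries an 'error' object instead.
print(response.get('result', response.get('error')))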
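
The AST half can also be exercised in-process, without the JSON-RPC wrapper. A small illustration of the node shape parse_python_ast emits, run inside the same module (the sample source below is made up):

import asyncio

sample = '''
import os

class Greeter:
    def greet(self, name: str) -> str:
        return os.path.basename(name)
'''

result = asyncio.run(parse_python_ast(sample, '<sample>'))
print(result['imports'])                              # [{'source': 'os', 'imported': 'os'}]
print(result['nodes'][0]['methods'][0]['signature'])  # greet(self, name: str) -> str
print(result['nodes'][0]['methods'][0]['calls'])      # ['basename']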