@geravant/sinain 1.0.19 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +10 -1
  2. package/cli.js +176 -0
  3. package/install.js +11 -2
  4. package/launcher.js +622 -0
  5. package/openclaw.plugin.json +4 -0
  6. package/pack-prepare.js +48 -0
  7. package/package.json +24 -5
  8. package/sense_client/README.md +82 -0
  9. package/sense_client/__init__.py +1 -0
  10. package/sense_client/__main__.py +462 -0
  11. package/sense_client/app_detector.py +54 -0
  12. package/sense_client/app_detector_win.py +83 -0
  13. package/sense_client/capture.py +215 -0
  14. package/sense_client/capture_win.py +88 -0
  15. package/sense_client/change_detector.py +86 -0
  16. package/sense_client/config.py +64 -0
  17. package/sense_client/gate.py +145 -0
  18. package/sense_client/ocr.py +347 -0
  19. package/sense_client/privacy.py +65 -0
  20. package/sense_client/requirements.txt +13 -0
  21. package/sense_client/roi_extractor.py +84 -0
  22. package/sense_client/sender.py +173 -0
  23. package/sense_client/tests/__init__.py +0 -0
  24. package/sense_client/tests/test_stream1_optimizations.py +234 -0
  25. package/setup-overlay.js +82 -0
  26. package/sinain-agent/.env.example +17 -0
  27. package/sinain-agent/CLAUDE.md +80 -0
  28. package/sinain-agent/mcp-config.json +12 -0
  29. package/sinain-agent/run.sh +248 -0
  30. package/sinain-core/.env.example +93 -0
  31. package/sinain-core/package-lock.json +552 -0
  32. package/sinain-core/package.json +21 -0
  33. package/sinain-core/src/agent/analyzer.ts +366 -0
  34. package/sinain-core/src/agent/context-window.ts +172 -0
  35. package/sinain-core/src/agent/loop.ts +404 -0
  36. package/sinain-core/src/agent/situation-writer.ts +187 -0
  37. package/sinain-core/src/agent/traits.ts +520 -0
  38. package/sinain-core/src/audio/capture-spawner-macos.ts +44 -0
  39. package/sinain-core/src/audio/capture-spawner-win.ts +37 -0
  40. package/sinain-core/src/audio/capture-spawner.ts +14 -0
  41. package/sinain-core/src/audio/pipeline.ts +335 -0
  42. package/sinain-core/src/audio/transcription-local.ts +141 -0
  43. package/sinain-core/src/audio/transcription.ts +278 -0
  44. package/sinain-core/src/buffers/feed-buffer.ts +71 -0
  45. package/sinain-core/src/buffers/sense-buffer.ts +425 -0
  46. package/sinain-core/src/config.ts +245 -0
  47. package/sinain-core/src/escalation/escalation-slot.ts +136 -0
  48. package/sinain-core/src/escalation/escalator.ts +812 -0
  49. package/sinain-core/src/escalation/message-builder.ts +323 -0
  50. package/sinain-core/src/escalation/openclaw-ws.ts +726 -0
  51. package/sinain-core/src/escalation/scorer.ts +166 -0
  52. package/sinain-core/src/index.ts +507 -0
  53. package/sinain-core/src/learning/feedback-store.ts +253 -0
  54. package/sinain-core/src/learning/signal-collector.ts +218 -0
  55. package/sinain-core/src/log.ts +24 -0
  56. package/sinain-core/src/overlay/commands.ts +126 -0
  57. package/sinain-core/src/overlay/ws-handler.ts +267 -0
  58. package/sinain-core/src/privacy/index.ts +18 -0
  59. package/sinain-core/src/privacy/presets.ts +40 -0
  60. package/sinain-core/src/privacy/redact.ts +92 -0
  61. package/sinain-core/src/profiler.ts +181 -0
  62. package/sinain-core/src/recorder.ts +186 -0
  63. package/sinain-core/src/server.ts +417 -0
  64. package/sinain-core/src/trace/trace-store.ts +73 -0
  65. package/sinain-core/src/trace/tracer.ts +94 -0
  66. package/sinain-core/src/types.ts +427 -0
  67. package/sinain-core/src/util/dedup.ts +48 -0
  68. package/sinain-core/src/util/task-store.ts +84 -0
  69. package/sinain-core/tsconfig.json +18 -0
  70. package/sinain-knowledge/data/git-store.ts +2 -0
  71. package/sinain-mcp-server/index.ts +337 -0
  72. package/sinain-mcp-server/package.json +19 -0
  73. package/sinain-mcp-server/tsconfig.json +15 -0
@@ -0,0 +1,347 @@
1
+ """OCR backends for UI text extraction: macOS Vision, Windows.Media.Ocr, and Tesseract."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import re
6
+ import sys
7
+ from dataclasses import dataclass
8
+
9
+ from PIL import Image
10
+
11
+ try:
12
+ import pytesseract
13
+ except ImportError:
14
+ pytesseract = None
15
+
16
+
17
@dataclass
class OCRResult:
    """Result of one OCR pass over a single image."""

    # Extracted text after cleaning (control chars stripped, noise lines dropped).
    text: str
    # Average confidence of the kept words/lines, on a 0-100 scale.
    confidence: float
    # Number of words that survived the confidence filter.
    word_count: int
22
+
23
+
24
class LocalOCR:
    """Tesseract OCR wrapper for UI text extraction.

    Args:
        lang: Tesseract language code (e.g. "eng").
        psm: Tesseract page segmentation mode (11 = sparse text).
        min_confidence: minimum per-word confidence (0-100) to keep a word.
        enabled: when False, extract() is a no-op returning an empty result.
    """

    def __init__(self, lang: str = "eng", psm: int = 11,
                 min_confidence: int = 30, enabled: bool = True):
        self.lang = lang
        self.psm = psm
        self.min_confidence = min_confidence
        self.enabled = enabled

    def extract(self, image: Image.Image) -> OCRResult:
        """Run Tesseract over *image*; returns cleaned text with confidence.

        Returns an empty OCRResult when disabled, when pytesseract is not
        installed, or when Tesseract itself raises.
        """
        if not self.enabled or pytesseract is None:
            return OCRResult(text="", confidence=0, word_count=0)

        try:
            data = pytesseract.image_to_data(
                image,
                lang=self.lang,
                config=f"--psm {self.psm}",
                output_type=pytesseract.Output.DICT,
            )
        except Exception as e:
            print(f"[ocr] error: {e}", flush=True)
            return OCRResult(text="", confidence=0, word_count=0)

        words, confidences = self._filter_words(data, self.min_confidence)

        text = self._clean(" ".join(words))
        avg_conf = sum(confidences) / len(confidences) if confidences else 0

        return OCRResult(
            text=text,
            confidence=avg_conf,
            word_count=len(words),
        )

    @staticmethod
    def _filter_words(data: dict, min_confidence: int) -> tuple[list[str], list[int]]:
        """Keep non-empty words from a Tesseract TSV dict that pass the threshold.

        BUGFIX: Tesseract reports confidence as "-1" for structural rows and,
        depending on version, as an int or a float STRING (e.g. "96.58").
        int("96.58") raises ValueError, which previously dropped those words;
        parse through float first.
        """
        words: list[str] = []
        confidences: list[int] = []
        for i, conf in enumerate(data["conf"]):
            try:
                c = int(float(conf))
            except (ValueError, TypeError):
                continue
            if c >= min_confidence:
                word = data["text"][i].strip()
                if word:
                    words.append(word)
                    confidences.append(c)
        return words, confidences

    @staticmethod
    def _clean(text: str) -> str:
        """Strip control chars, collapse whitespace, remove noise lines."""
        # Drop ASCII control chars except \t and \n (kept for line structure).
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        text = re.sub(r"[ \t]+", " ", text)
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = line.strip()
            # Keep only lines with at least one alphanumeric char.
            if line and re.search(r"[a-zA-Z0-9]", line):
                cleaned.append(line)
        return "\n".join(cleaned)
85
+
86
+
87
class VisionOCR:
    """macOS Vision framework OCR using pyobjc.

    The Vision framework is loaded once at construction; if pyobjc or the
    framework is missing, `_available` stays False and extract() returns
    empty results instead of raising.
    """

    def __init__(self, languages: list[str] | None = None,
                 min_confidence: float = 0.5, enabled: bool = True):
        # languages: recognition language codes handed to VNRecognizeTextRequest.
        self.languages = languages or ["en", "ru"]
        # NOTE: min_confidence here is on Vision's 0.0-1.0 scale
        # (LocalOCR uses 0-100; create_ocr divides by 100 before passing it in).
        self.min_confidence = min_confidence
        self.enabled = enabled
        self._available = False

        if not enabled:
            return

        try:
            import objc  # noqa: F401
            import Quartz  # noqa: F401
            from Foundation import NSURL, NSData  # noqa: F401
            # Bind the Vision framework's classes into this module's globals.
            objc.loadBundle('Vision', bundle_path='/System/Library/Frameworks/Vision.framework',
                            module_globals=globals())
            self._available = True
        except Exception as e:
            print(f"[ocr] Vision framework unavailable: {e}", flush=True)

    def extract(self, image: Image.Image) -> OCRResult:
        """Returns extracted text using macOS Vision framework.

        Never raises: any pyobjc/Vision failure is logged and an empty
        OCRResult is returned instead.
        """
        if not self.enabled or not self._available:
            return OCRResult(text="", confidence=0, word_count=0)

        try:
            return self._do_extract(image)
        except Exception as e:
            print(f"[ocr] Vision error: {e}", flush=True)
            return OCRResult(text="", confidence=0, word_count=0)

    def _do_extract(self, image: Image.Image) -> OCRResult:
        """Run a single VNRecognizeTextRequest over *image* and collect lines."""
        import objc
        import Vision
        from Foundation import NSData
        import Quartz

        # Convert PIL Image to CGImage via PNG bytes
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        png_data = buf.getvalue()

        ns_data = NSData.dataWithBytes_length_(png_data, len(png_data))
        ci_image = Quartz.CIImage.imageWithData_(ns_data)
        context = Quartz.CIContext.context()
        cg_image = context.createCGImage_fromRect_(ci_image, ci_image.extent())

        if cg_image is None:
            # Image could not be decoded — treat as "no text found".
            return OCRResult(text="", confidence=0, word_count=0)

        # Create and configure request
        request = Vision.VNRecognizeTextRequest.alloc().init()
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setRecognitionLanguages_(self.languages)
        request.setUsesLanguageCorrection_(True)

        # Execute
        handler = Vision.VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
        # performRequests_error_ comes back through the pyobjc bridge as a
        # tuple whose first element is the success flag — hence success[0].
        success = handler.performRequests_error_([request], objc.nil)
        if not success[0]:
            return OCRResult(text="", confidence=0, word_count=0)

        results = request.results()
        if not results:
            return OCRResult(text="", confidence=0, word_count=0)

        lines = []
        confidences = []
        word_count = 0

        for observation in results:
            # Only the single best candidate per recognized line is considered.
            candidate = observation.topCandidates_(1)
            if not candidate:
                continue
            text = candidate[0].string()
            conf = candidate[0].confidence()

            if conf < self.min_confidence:
                continue
            if text and text.strip():
                lines.append(text.strip())
                confidences.append(conf)
                word_count += len(text.split())

        text = "\n".join(lines)
        text = self._clean(text)
        # Vision confidences are 0-1; scale to 0-100 to match OCRResult's contract.
        avg_conf = (sum(confidences) / len(confidences) * 100) if confidences else 0

        return OCRResult(
            text=text,
            confidence=avg_conf,
            word_count=word_count,
        )

    @staticmethod
    def _clean(text: str) -> str:
        """Collapse whitespace, remove noise lines.

        Keeps only lines containing at least one Latin/Cyrillic letter or digit.
        """
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = re.sub(r"[ \t]+", " ", line).strip()
            if line and re.search(r"[a-zA-Z0-9а-яА-ЯёЁ]", line):
                cleaned.append(line)
        return "\n".join(cleaned)
195
+
196
+
197
class WinOCR:
    """Windows.Media.Ocr backend via winrt-python (Windows 10+).

    The OCR engine is created once in __init__; if the winrt packages or the
    requested OCR language pack are missing, `_available` stays False and
    extract() returns empty results instead of raising.
    """

    def __init__(self, language: str = "en", min_confidence: float = 0.5,
                 enabled: bool = True):
        # language: a single BCP-47 tag; a matching Windows OCR language pack
        # must be installed for the engine to be created.
        self.language = language
        # NOTE(review): min_confidence is stored but never consulted below —
        # Windows.Media.Ocr exposes no per-word confidence in this code path.
        self.min_confidence = min_confidence
        self.enabled = enabled
        self._available = False
        self._engine = None

        if not enabled:
            return

        try:
            from winrt.windows.media.ocr import OcrEngine
            from winrt.windows.globalization import Language

            lang = Language(language)
            if OcrEngine.is_language_supported(lang):
                self._engine = OcrEngine.try_create_from_language(lang)
                # try_create_from_language returns None on failure.
                self._available = self._engine is not None
            else:
                print(f"[ocr] WinOCR: language '{language}' not supported", flush=True)
        except Exception as e:
            print(f"[ocr] WinOCR unavailable: {e}", flush=True)

    def extract(self, image: Image.Image) -> OCRResult:
        """Returns extracted text using Windows.Media.Ocr.

        Never raises: failures are logged and an empty OCRResult is returned.
        """
        if not self.enabled or not self._available:
            return OCRResult(text="", confidence=0, word_count=0)

        try:
            return self._do_extract(image)
        except Exception as e:
            print(f"[ocr] WinOCR error: {e}", flush=True)
            return OCRResult(text="", confidence=0, word_count=0)

    def _do_extract(self, image: Image.Image) -> OCRResult:
        """Decode *image* into a SoftwareBitmap and run the OCR engine.

        The WinRT API is async-only, so a throwaway event loop drives the
        awaits to completion synchronously.
        """
        import asyncio
        from winrt.windows.graphics.imaging import (
            SoftwareBitmap, BitmapPixelFormat, BitmapAlphaMode,
        )
        from winrt.windows.storage.streams import (
            InMemoryRandomAccessStream, DataWriter,
        )

        # Convert PIL to BMP bytes and load as SoftwareBitmap
        buf = io.BytesIO()
        image.convert("RGBA").save(buf, format="BMP")
        bmp_bytes = buf.getvalue()

        async def _run():
            # Write the BMP bytes into an in-memory WinRT stream.
            stream = InMemoryRandomAccessStream()
            writer = DataWriter(stream)
            writer.write_bytes(bmp_bytes)
            await writer.store_async()
            # Rewind so the decoder reads from the beginning.
            stream.seek(0)

            from winrt.windows.graphics.imaging import BitmapDecoder
            decoder = await BitmapDecoder.create_async(stream)
            bitmap = await decoder.get_software_bitmap_async()

            # Convert to supported pixel format if needed
            if bitmap.bitmap_pixel_format != BitmapPixelFormat.BGRA8:
                bitmap = SoftwareBitmap.convert(bitmap, BitmapPixelFormat.BGRA8,
                                                BitmapAlphaMode.PREMULTIPLIED)

            result = await self._engine.recognize_async(bitmap)
            return result

        # Fresh loop per call: cheap, and avoids interfering with any
        # running loop owned by the caller.
        loop = asyncio.new_event_loop()
        try:
            result = loop.run_until_complete(_run())
        finally:
            loop.close()

        lines = []
        word_count = 0
        for line in result.lines:
            text = line.text.strip()
            if text:
                lines.append(text)
                word_count += len(text.split())

        text = "\n".join(lines)
        text = self._clean(text)

        # Windows.Media.Ocr reports no confidence — 80.0 is a fixed placeholder.
        return OCRResult(text=text, confidence=80.0, word_count=word_count)

    @staticmethod
    def _clean(text: str) -> str:
        """Strip control chars; drop lines without a letter or digit."""
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = re.sub(r"[ \t]+", " ", line).strip()
            if line and re.search(r"[a-zA-Z0-9а-яА-ЯёЁ]", line):
                cleaned.append(line)
        return "\n".join(cleaned)
297
+
298
+
299
def create_ocr(config: dict):
    """Factory: pick the best available OCR backend for this platform.

    config["ocr"] keys:
        backend: "auto" | "vision" | "tesseract" | "winocr"
        languages: list[str] (BCP-47 for Vision / WinOCR, e.g. ["en", "ru"])
        lang: str (Tesseract lang code, e.g. "eng")
        minConfidence: int (0-100 scale)
        enabled: bool
    """
    ocr_cfg = config.get("ocr", {})
    backend = ocr_cfg.get("backend", "auto")
    enabled = ocr_cfg.get("enabled", True)
    threshold_pct = ocr_cfg.get("minConfidence", 50)

    # macOS: prefer the native Vision framework when allowed.
    if sys.platform == "darwin" and backend in ("auto", "vision"):
        vision = VisionOCR(
            languages=ocr_cfg.get("languages", ["en", "ru"]),
            min_confidence=threshold_pct / 100.0,  # Vision uses a 0-1 scale
            enabled=enabled,
        )
        if vision._available:
            print(f"[ocr] using Vision backend (languages={vision.languages})", flush=True)
            return vision
        if backend == "vision":
            print("[ocr] Vision requested but unavailable, falling back to Tesseract", flush=True)

    # Windows: prefer the built-in Windows.Media.Ocr when allowed.
    if sys.platform == "win32" and backend in ("auto", "winocr"):
        langs = ocr_cfg.get("languages", ["en"])
        winocr = WinOCR(
            language=langs[0] if langs else "en",
            min_confidence=threshold_pct / 100.0,
            enabled=enabled,
        )
        if winocr._available:
            print(f"[ocr] using WinOCR backend (language={winocr.language})", flush=True)
            return winocr
        if backend == "winocr":
            print("[ocr] WinOCR requested but unavailable, falling back to Tesseract", flush=True)

    # Cross-platform fallback: Tesseract (0-100 confidence scale, no division).
    print("[ocr] using Tesseract backend", flush=True)
    return LocalOCR(
        lang=ocr_cfg.get("lang", "eng"),
        psm=ocr_cfg.get("psm", 11),
        min_confidence=threshold_pct,
        enabled=enabled,
    )
@@ -0,0 +1,65 @@
1
+ """Privacy filter — strips <private> tags and auto-redacts sensitive patterns from OCR text."""
2
+
3
+ import re
4
+
5
# Patterns that auto-redact without manual tagging
_REDACT_PATTERNS: list[tuple[re.Pattern, str]] = [
    # Credit card numbers (4 groups of 4 digits)
    (re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"), "[REDACTED:card]"),
    # API keys / tokens (long hex or base64 strings)
    (re.compile(r"\b(?:sk-|pk-|api[_-]?key[=:]\s*)[A-Za-z0-9_\-]{20,}\b"), "[REDACTED:apikey]"),
    # Bearer tokens
    (re.compile(r"Bearer\s+[A-Za-z0-9_\-\.]{20,}"), "[REDACTED:bearer]"),
    # AWS secret keys
    (re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b"), "[REDACTED:awskey]"),
    # Passwords in assignment context
    (re.compile(r"(?:password|passwd|pwd)\s*[:=]\s*\S+", re.IGNORECASE), "[REDACTED:password]"),
    # GitHub personal access tokens
    (re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "[REDACTED:github_pat]"),
    # GitHub server tokens
    (re.compile(r"\bghs_[A-Za-z0-9]{36}\b"), "[REDACTED:github_srv]"),
    # Slack tokens
    (re.compile(r"\bxox[bpoa]-[0-9A-Za-z\-]+"), "[REDACTED:slack]"),
    # Google OAuth tokens
    (re.compile(r"\bya29\.[0-9A-Za-z\-_]+"), "[REDACTED:google_oauth]"),
    # JWT tokens (three base64url segments)
    (re.compile(r"\beyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+"), "[REDACTED:jwt]"),
    # Generic secrets / keys in assignment context
    (re.compile(r"(?:secret|token|key)\s*[:=]\s*[A-Za-z0-9_\-\.]{10,}", re.IGNORECASE), "[REDACTED:secret]"),
    # Email addresses
    (re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"), "[REDACTED:email]"),
    # US phone numbers
    (re.compile(r"\+?1?\s?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}\b"), "[REDACTED:phone]"),
    # SSN (XXX-XX-XXXX)
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED:ssn]"),
    # CVV codes
    (re.compile(r"\bCVV\s*[:=]?\s*\d{3,4}\b", re.IGNORECASE), "[REDACTED:cvv]"),
    # PIN codes in assignment context
    (re.compile(r"\bpin\s*[:=]\s*\d{4,8}\b", re.IGNORECASE), "[REDACTED:pin]"),
    # Private key headers
    (re.compile(r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----"), "[REDACTED:privkey]"),
    # MRN (medical record numbers)
    (re.compile(r"\bMRN\s*[:=]?\s*\d{6,10}\b", re.IGNORECASE), "[REDACTED:mrn]"),
]

# Matches <private>...</private> blocks (including multiline)
_PRIVATE_TAG = re.compile(r"<private>.*?</private>", re.DOTALL)


def strip_private(text: str) -> str:
    """Drop every manually tagged <private>...</private> block, then trim."""
    return _PRIVATE_TAG.sub("", text).strip()


def redact_sensitive(text: str) -> str:
    """Replace anything matching a known secret/PII pattern with its marker.

    Patterns are applied in declaration order, so more specific patterns
    (e.g. credit cards) win over broader ones (e.g. phone numbers).
    """
    redacted = text
    for pattern, marker in _REDACT_PATTERNS:
        redacted = pattern.sub(marker, redacted)
    return redacted


def apply_privacy(text: str) -> str:
    """Full privacy pipeline: strip manual <private> tags, then auto-redact."""
    return redact_sensitive(strip_private(text))
@@ -0,0 +1,13 @@
1
+ pillow>=10.0
2
+ scikit-image>=0.22
3
+ numpy>=1.24
4
+ pytesseract>=0.3
5
+ requests>=2.31
6
+ mss>=9.0; sys_platform == "win32"
7
+ psutil>=5.9; sys_platform == "win32"
8
+ winrt-Windows.Media.Ocr>=2.0; sys_platform == "win32"
9
+ winrt-Windows.Globalization>=2.0; sys_platform == "win32"
10
+ winrt-Windows.Graphics.Imaging>=2.0; sys_platform == "win32"
11
+ winrt-Windows.Storage.Streams>=2.0; sys_platform == "win32"
12
+ winrt-Windows.Foundation>=2.0; sys_platform == "win32"
13
+ winrt-Windows.Foundation.Collections>=2.0; sys_platform == "win32"
@@ -0,0 +1,84 @@
1
+ """Region of Interest extraction from changed regions."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import numpy as np
6
+ from PIL import Image
7
+
8
+
9
@dataclass
class ROI:
    """One cropped region of interest extracted from a frame."""

    # Cropped sub-image of the frame.
    image: Image.Image
    # Crop origin and size in frame pixel coordinates.
    bbox: tuple[int, int, int, int]  # (x, y, w, h)
13
+
14
+
15
class ROIExtractor:
    """Crops changed regions out of a frame, merging nearby change areas."""

    def __init__(self, padding: int = 20, min_size: tuple[int, int] = (64, 64),
                 max_rois: int = 3):
        # Pixels of context added around each box, and the merge tolerance.
        self.padding = padding
        # Crops smaller than this (w, h) are discarded.
        self.min_size = min_size
        # Upper bound on crops returned per frame.
        self.max_rois = max_rois

    def extract(self, frame: Image.Image, contours: list) -> list[ROI]:
        """Return up to max_rois padded crops covering the given contours."""
        if not contours:
            return []

        # One bounding box per contour; contour points are (row, col) = (y, x).
        raw_boxes = []
        for pts in contours:
            arr = np.array(pts)
            top, left = arr.min(axis=0)
            bottom, right = arr.max(axis=0)
            raw_boxes.append((int(left), int(top), int(right), int(bottom)))

        frame_w, frame_h = frame.size
        crops: list[ROI] = []
        for left, top, right, bottom in self._merge_boxes(raw_boxes)[:self.max_rois]:
            # Expand by padding, clamped to the frame bounds.
            left = max(0, left - self.padding)
            top = max(0, top - self.padding)
            right = min(frame_w, right + self.padding)
            bottom = min(frame_h, bottom + self.padding)

            width = right - left
            height = bottom - top
            if width < self.min_size[0] or height < self.min_size[1]:
                continue  # too small to be a useful crop

            crops.append(ROI(image=frame.crop((left, top, right, bottom)),
                             bbox=(left, top, width, height)))

        return crops

    def _merge_boxes(self, boxes: list[tuple]) -> list[tuple]:
        """Greedily merge boxes that overlap or sit within padding of each other."""
        if not boxes:
            return []

        merged: list[list] = []
        # Sweep left-to-right so each box only needs comparing to the last group.
        for x1, y1, x2, y2 in sorted(boxes, key=lambda b: b[0]):
            if merged:
                last = merged[-1]
                near_x = x1 <= last[2] + self.padding
                near_y = y1 <= last[3] + self.padding and y2 >= last[1] - self.padding
                if near_x and near_y:
                    # Absorb this box into the current group.
                    last[0] = min(last[0], x1)
                    last[1] = min(last[1], y1)
                    last[2] = max(last[2], x2)
                    last[3] = max(last[3], y2)
                    continue
            merged.append([x1, y1, x2, y2])

        # Largest area first so the biggest change wins the ROI budget.
        merged.sort(key=lambda b: (b[2] - b[0]) * (b[3] - b[1]), reverse=True)
        return [tuple(b) for b in merged]
@@ -0,0 +1,173 @@
1
+ """POST sense events to the relay server."""
2
+
3
+ import base64
4
+ import io
5
+ import time
6
+
7
+ import requests
8
+ from PIL import Image
9
+
10
+ from .gate import SenseEvent
11
+
12
+ # Retry config for /sense POST
13
+ _MAX_RETRIES = 3
14
+ _RETRY_BASE_DELAY_S = 1.0 # 1s, 2s, 4s (exponential)
15
+
16
+
17
+ class SenseSender:
18
+ """POSTs sense events to the relay server with retry and backoff."""
19
+
20
+ def __init__(self, url: str = "http://localhost:9500",
21
+ max_image_kb: int = 500, send_thumbnails: bool = True):
22
+ self.url = url.rstrip("/")
23
+ self.max_image_kb = max_image_kb
24
+ self.send_thumbnails = send_thumbnails
25
+ self._latencies: list[float] = []
26
+ self._last_stats_ts: float = time.time()
27
+ self._consecutive_failures: int = 0
28
+
29
+ def send(self, event: SenseEvent) -> bool:
30
+ """POST /sense with JSON payload. Returns True on success."""
31
+ payload = {
32
+ "type": event.type,
33
+ "ts": event.ts,
34
+ "ocr": event.ocr,
35
+ "meta": {
36
+ "ssim": event.meta.ssim,
37
+ "app": event.meta.app,
38
+ "windowTitle": event.meta.window_title,
39
+ "screen": event.meta.screen,
40
+ },
41
+ }
42
+ if event.roi:
43
+ payload["roi"] = event.roi
44
+ if event.diff:
45
+ payload["diff"] = event.diff
46
+ if event.observation and event.observation.title:
47
+ payload["observation"] = {
48
+ "title": event.observation.title,
49
+ "subtitle": event.observation.subtitle,
50
+ "facts": event.observation.facts,
51
+ "narrative": event.observation.narrative,
52
+ "concepts": event.observation.concepts,
53
+ }
54
+
55
+ for attempt in range(_MAX_RETRIES):
56
+ try:
57
+ start = time.time()
58
+ resp = requests.post(
59
+ f"{self.url}/sense",
60
+ json=payload,
61
+ timeout=5,
62
+ )
63
+ elapsed_ms = (time.time() - start) * 1000
64
+ self._latencies.append(elapsed_ms)
65
+ self._maybe_log_stats()
66
+
67
+ if resp.status_code == 200:
68
+ if self._consecutive_failures > 0:
69
+ print(f"[sender] reconnected after {self._consecutive_failures} failure(s)", flush=True)
70
+ self._consecutive_failures = 0
71
+ return True
72
+
73
+ # Non-200 but not an exception — log and retry
74
+ print(f"[sender] HTTP {resp.status_code} on attempt {attempt + 1}/{_MAX_RETRIES}", flush=True)
75
+
76
+ except requests.exceptions.ConnectionError as e:
77
+ print(f"[sender] connection error (attempt {attempt + 1}/{_MAX_RETRIES}): {e}", flush=True)
78
+ except requests.exceptions.Timeout:
79
+ print(f"[sender] timeout (attempt {attempt + 1}/{_MAX_RETRIES})", flush=True)
80
+ except Exception as e:
81
+ print(f"[sender] unexpected error (attempt {attempt + 1}/{_MAX_RETRIES}): {e}", flush=True)
82
+
83
+ # Don't sleep after the last attempt
84
+ if attempt < _MAX_RETRIES - 1:
85
+ delay = _RETRY_BASE_DELAY_S * (2 ** attempt)
86
+ print(f"[sender] retrying in {delay:.1f}s...", flush=True)
87
+ time.sleep(delay)
88
+
89
+ self._consecutive_failures += 1
90
+ if self._consecutive_failures == 1 or self._consecutive_failures % 10 == 0:
91
+ print(f"[sender] all {_MAX_RETRIES} attempts failed (consecutive failures: {self._consecutive_failures})", flush=True)
92
+ return False
93
+
94
+ def _maybe_log_stats(self):
95
+ """Log P50/P95 send latencies every 60s."""
96
+ now = time.time()
97
+ if now - self._last_stats_ts < 60:
98
+ return
99
+ if not self._latencies:
100
+ return
101
+ sorted_lat = sorted(self._latencies)
102
+ p50 = sorted_lat[len(sorted_lat) // 2]
103
+ p95 = sorted_lat[int(len(sorted_lat) * 0.95)]
104
+ print(f"[sender] relay latency: p50={p50:.0f}ms p95={p95:.0f}ms (n={len(sorted_lat)})", flush=True)
105
+ self._latencies.clear()
106
+ self._last_stats_ts = now
107
+
108
+
109
+ def encode_image(img: Image.Image, max_kb: int, max_px: int = 0) -> str:
110
+ """Encode PIL Image to base64 JPEG, reducing quality until under max_kb."""
111
+ if max_px:
112
+ ratio = max_px / max(img.size)
113
+ if ratio < 1:
114
+ img = img.resize(
115
+ (int(img.width * ratio), int(img.height * ratio)),
116
+ Image.LANCZOS,
117
+ )
118
+
119
+ if img.mode == "RGBA":
120
+ img = img.convert("RGB")
121
+
122
+ # Try high quality first — often fits
123
+ max_bytes = max_kb * 1024
124
+ buf = io.BytesIO()
125
+ img.save(buf, format="JPEG", quality=85)
126
+ if buf.tell() <= max_bytes:
127
+ return base64.b64encode(buf.getvalue()).decode()
128
+
129
+ # Binary search for the highest quality that fits
130
+ lo, hi = 20, 80
131
+ best_buf = None
132
+ while lo <= hi:
133
+ mid = (lo + hi) // 2
134
+ buf = io.BytesIO()
135
+ img.save(buf, format="JPEG", quality=mid)
136
+ if buf.tell() <= max_bytes:
137
+ best_buf = buf
138
+ lo = mid + 1
139
+ else:
140
+ hi = mid - 1
141
+
142
+ if best_buf is not None:
143
+ return base64.b64encode(best_buf.getvalue()).decode()
144
+
145
+ # Last resort: return at lowest quality
146
+ buf = io.BytesIO()
147
+ img.save(buf, format="JPEG", quality=20)
148
+ return base64.b64encode(buf.getvalue()).decode()
149
+
150
+
151
+ def package_full_frame(frame: Image.Image, max_px: int = 384) -> dict:
152
+ """Package a full frame as a small thumbnail for context events."""
153
+ return {
154
+ "data": encode_image(frame, max_kb=200, max_px=max_px),
155
+ "bbox": [0, 0, frame.width, frame.height],
156
+ "thumb": True,
157
+ }
158
+
159
+
160
+ def package_roi(roi, thumb: bool = True) -> dict:
161
+ """Package an ROI as a small thumbnail for text/visual events."""
162
+ return {
163
+ "data": encode_image(roi.image, max_kb=60, max_px=384),
164
+ "bbox": list(roi.bbox),
165
+ "thumb": True,
166
+ }
167
+
168
+
169
+ def package_diff(diff_image: Image.Image) -> dict:
170
+ """Package a diff image."""
171
+ return {
172
+ "data": encode_image(diff_image, max_kb=200),
173
+ }
File without changes