@geravant/sinain 1.0.18 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/cli.js +176 -0
- package/index.ts +163 -1257
- package/install.js +12 -2
- package/launcher.js +622 -0
- package/openclaw.plugin.json +4 -0
- package/pack-prepare.js +48 -0
- package/package.json +26 -5
- package/sense_client/README.md +82 -0
- package/sense_client/__init__.py +1 -0
- package/sense_client/__main__.py +462 -0
- package/sense_client/app_detector.py +54 -0
- package/sense_client/app_detector_win.py +83 -0
- package/sense_client/capture.py +215 -0
- package/sense_client/capture_win.py +88 -0
- package/sense_client/change_detector.py +86 -0
- package/sense_client/config.py +64 -0
- package/sense_client/gate.py +145 -0
- package/sense_client/ocr.py +347 -0
- package/sense_client/privacy.py +65 -0
- package/sense_client/requirements.txt +13 -0
- package/sense_client/roi_extractor.py +84 -0
- package/sense_client/sender.py +173 -0
- package/sense_client/tests/__init__.py +0 -0
- package/sense_client/tests/test_stream1_optimizations.py +234 -0
- package/setup-overlay.js +82 -0
- package/sinain-agent/.env.example +17 -0
- package/sinain-agent/CLAUDE.md +80 -0
- package/sinain-agent/mcp-config.json +12 -0
- package/sinain-agent/run.sh +248 -0
- package/sinain-core/.env.example +93 -0
- package/sinain-core/package-lock.json +552 -0
- package/sinain-core/package.json +21 -0
- package/sinain-core/src/agent/analyzer.ts +366 -0
- package/sinain-core/src/agent/context-window.ts +172 -0
- package/sinain-core/src/agent/loop.ts +404 -0
- package/sinain-core/src/agent/situation-writer.ts +187 -0
- package/sinain-core/src/agent/traits.ts +520 -0
- package/sinain-core/src/audio/capture-spawner-macos.ts +44 -0
- package/sinain-core/src/audio/capture-spawner-win.ts +37 -0
- package/sinain-core/src/audio/capture-spawner.ts +14 -0
- package/sinain-core/src/audio/pipeline.ts +335 -0
- package/sinain-core/src/audio/transcription-local.ts +141 -0
- package/sinain-core/src/audio/transcription.ts +278 -0
- package/sinain-core/src/buffers/feed-buffer.ts +71 -0
- package/sinain-core/src/buffers/sense-buffer.ts +425 -0
- package/sinain-core/src/config.ts +245 -0
- package/sinain-core/src/escalation/escalation-slot.ts +136 -0
- package/sinain-core/src/escalation/escalator.ts +812 -0
- package/sinain-core/src/escalation/message-builder.ts +323 -0
- package/sinain-core/src/escalation/openclaw-ws.ts +726 -0
- package/sinain-core/src/escalation/scorer.ts +166 -0
- package/sinain-core/src/index.ts +507 -0
- package/sinain-core/src/learning/feedback-store.ts +253 -0
- package/sinain-core/src/learning/signal-collector.ts +218 -0
- package/sinain-core/src/log.ts +24 -0
- package/sinain-core/src/overlay/commands.ts +126 -0
- package/sinain-core/src/overlay/ws-handler.ts +267 -0
- package/sinain-core/src/privacy/index.ts +18 -0
- package/sinain-core/src/privacy/presets.ts +40 -0
- package/sinain-core/src/privacy/redact.ts +92 -0
- package/sinain-core/src/profiler.ts +181 -0
- package/sinain-core/src/recorder.ts +186 -0
- package/sinain-core/src/server.ts +417 -0
- package/sinain-core/src/trace/trace-store.ts +73 -0
- package/sinain-core/src/trace/tracer.ts +94 -0
- package/sinain-core/src/types.ts +427 -0
- package/sinain-core/src/util/dedup.ts +48 -0
- package/sinain-core/src/util/task-store.ts +84 -0
- package/sinain-core/tsconfig.json +18 -0
- package/sinain-knowledge/adapters/generic/adapter.ts +103 -0
- package/sinain-knowledge/adapters/interface.ts +72 -0
- package/sinain-knowledge/adapters/openclaw/adapter.ts +223 -0
- package/sinain-knowledge/curation/engine.ts +493 -0
- package/sinain-knowledge/curation/resilience.ts +336 -0
- package/sinain-knowledge/data/git-store.ts +312 -0
- package/sinain-knowledge/data/schema.ts +89 -0
- package/sinain-knowledge/data/snapshot.ts +226 -0
- package/sinain-knowledge/data/store.ts +488 -0
- package/sinain-knowledge/deploy/cli.ts +214 -0
- package/sinain-knowledge/deploy/manifest.ts +80 -0
- package/sinain-knowledge/protocol/bindings/generic.md +5 -0
- package/sinain-knowledge/protocol/bindings/openclaw.md +5 -0
- package/sinain-knowledge/protocol/heartbeat.md +62 -0
- package/sinain-knowledge/protocol/renderer.ts +56 -0
- package/sinain-knowledge/protocol/skill.md +335 -0
- package/sinain-mcp-server/index.ts +337 -0
- package/sinain-mcp-server/package.json +19 -0
- package/sinain-mcp-server/tsconfig.json +15 -0

package/sense_client/gate.py
@@ -0,0 +1,145 @@
"""Decision gate — classifies sense events and decides what to send."""
from __future__ import annotations

import difflib
import time
from collections import deque
from dataclasses import dataclass, field

from .change_detector import ChangeResult
from .ocr import OCRResult


@dataclass
class SenseMeta:
    ssim: float = 0.0
    app: str = ""
    window_title: str = ""
    screen: int = 0


@dataclass
class SenseObservation:
    """Structured observation fields (claude-mem compatible schema).

    Populated by sinain-core's agent layer, not by sense_client.
    sense_client sets `title` and `facts` from OCR/app context;
    sinain-core enriches with `narrative` and `concepts`.
    """
    title: str = ""
    subtitle: str = ""
    facts: list[str] = field(default_factory=list)
    narrative: str = ""
    concepts: list[str] = field(default_factory=list)


@dataclass
class SenseEvent:
    type: str  # "text" | "visual" | "context"
    ts: float = 0.0
    ocr: str = ""
    roi: dict | None = None
    diff: dict | None = None
    meta: SenseMeta = field(default_factory=SenseMeta)
    observation: SenseObservation = field(default_factory=SenseObservation)


class DecisionGate:
    """Classifies sense events and decides what to send."""

    def __init__(self, min_ocr_chars: int = 20,
                 major_change_threshold: float = 0.85,
                 cooldown_ms: int = 5000,
                 adaptive_cooldown_ms: int = 2000,
                 context_cooldown_ms: int = 10000):
        self.min_ocr_chars = min_ocr_chars
        self.major_change_threshold = major_change_threshold
        self.cooldown_ms = cooldown_ms
        self.adaptive_cooldown_ms = adaptive_cooldown_ms
        self.context_cooldown_ms = context_cooldown_ms
        self.last_send_ts: float = 0
        self.last_context_ts: float = 0
        self.last_app_change_ts: float = 0
        # Fuzzy dedup: ring buffer of last 5 OCR texts
        self._recent_texts: deque[str] = deque(maxlen=5)
        self._last_sent_text: str = ""

    def is_ready(self, app_changed: bool, window_changed: bool) -> bool:
        """Time-based readiness check without consuming OCR output.

        Used by backpressure scheduling to decide whether to run OCR at all.
        """
        if app_changed or window_changed:
            return True
        now = time.time() * 1000
        recent = (now - self.last_app_change_ts) < 10000
        cooldown = self.adaptive_cooldown_ms if recent else self.cooldown_ms
        return now - self.last_send_ts >= cooldown

    def _is_duplicate(self, text: str) -> bool:
        """Check if text is too similar to any recently sent text."""
        if text == self._last_sent_text:
            return True
        for prev in self._recent_texts:
            ratio = difflib.SequenceMatcher(None, prev, text).ratio()
            if ratio > 0.7:
                return True
        return False

    @staticmethod
    def _ocr_quality_ok(text: str) -> bool:
        """Reject garbage OCR: >50% single-char tokens or <50% alphanumeric."""
        tokens = text.split()
        if not tokens:
            return False
        single_char = sum(1 for t in tokens if len(t) == 1)
        if single_char / len(tokens) > 0.5:
            return False
        alnum = sum(1 for ch in text if ch.isalnum())
        total = len(text.replace(" ", ""))
        if total > 0 and alnum / total < 0.5:
            return False
        return True

    def classify(self, change: ChangeResult | None,
                 ocr: OCRResult, app_changed: bool,
                 window_changed: bool = False) -> SenseEvent | None:
        """Returns SenseEvent to send, or None to drop."""
        now = time.time() * 1000

        # Context events (app/window change) bypass normal cooldown
        if app_changed or window_changed:
            self.last_app_change_ts = now
            if now - self.last_context_ts >= self.context_cooldown_ms:
                self.last_context_ts = now
                self.last_send_ts = now
                return SenseEvent(type="context", ts=now)

        # Adaptive cooldown: 2s after recent app switch, 5s otherwise
        recent_app_change = (now - self.last_app_change_ts) < 10000
        effective_cooldown = self.adaptive_cooldown_ms if recent_app_change else self.cooldown_ms
        if now - self.last_send_ts < effective_cooldown:
            return None

        if change is None:
            return None

        # OCR text sufficient -> text event
        if ocr.text and len(ocr.text) >= self.min_ocr_chars:
            if self._is_duplicate(ocr.text):
                return None
            if not self._ocr_quality_ok(ocr.text):
                return None
            self._recent_texts.append(ocr.text)
            self._last_sent_text = ocr.text
            self.last_send_ts = now
            return SenseEvent(type="text", ts=now, ocr=ocr.text,
                              meta=SenseMeta(ssim=change.ssim_score))

        # Major visual change -> visual event
        if change.ssim_score < self.major_change_threshold:
            self.last_send_ts = now
            return SenseEvent(type="visual", ts=now, ocr=ocr.text,
                              meta=SenseMeta(ssim=change.ssim_score))

        return None
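
For orientation, a minimal usage sketch of the gate above (an editorial illustration, not part of the published diff). It assumes sense_client is importable from the install location, and it substitutes a SimpleNamespace for the ChangeResult normally produced by change_detector, since classify only reads its ssim_score:

from types import SimpleNamespace

from sense_client.gate import DecisionGate
from sense_client.ocr import OCRResult

gate = DecisionGate(min_ocr_chars=20, cooldown_ms=5000)

# Stand-ins for what the capture, change-detection, and OCR stages would produce upstream.
change = SimpleNamespace(ssim_score=0.72)  # stand-in for a ChangeResult; only ssim_score is read
ocr = OCRResult(text="Inbox (3) - weekly report draft saved", confidence=91.0, word_count=7)

event = gate.classify(change, ocr, app_changed=False)
if event is not None:
    print(event.type, event.meta.ssim, event.ocr)  # -> "text" plus the OCR text on the first call
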

package/sense_client/ocr.py
@@ -0,0 +1,347 @@
"""OCR backends for UI text extraction: macOS Vision, Windows.Media.Ocr, and Tesseract."""
from __future__ import annotations

import io
import re
import sys
from dataclasses import dataclass

from PIL import Image

try:
    import pytesseract
except ImportError:
    pytesseract = None


@dataclass
class OCRResult:
    text: str
    confidence: float
    word_count: int


class LocalOCR:
    """Tesseract OCR wrapper for UI text extraction."""

    def __init__(self, lang: str = "eng", psm: int = 11,
                 min_confidence: int = 30, enabled: bool = True):
        self.lang = lang
        self.psm = psm
        self.min_confidence = min_confidence
        self.enabled = enabled

    def extract(self, image: Image.Image) -> OCRResult:
        """Returns extracted text with confidence."""
        if not self.enabled or pytesseract is None:
            return OCRResult(text="", confidence=0, word_count=0)

        try:
            data = pytesseract.image_to_data(
                image,
                lang=self.lang,
                config=f"--psm {self.psm}",
                output_type=pytesseract.Output.DICT,
            )
        except Exception as e:
            print(f"[ocr] error: {e}", flush=True)
            return OCRResult(text="", confidence=0, word_count=0)

        words = []
        confidences = []
        for i, conf in enumerate(data["conf"]):
            try:
                c = int(conf)
            except (ValueError, TypeError):
                continue
            if c >= self.min_confidence:
                word = data["text"][i].strip()
                if word:
                    words.append(word)
                    confidences.append(c)

        text = " ".join(words)
        text = self._clean(text)
        avg_conf = sum(confidences) / len(confidences) if confidences else 0

        return OCRResult(
            text=text,
            confidence=avg_conf,
            word_count=len(words),
        )

    @staticmethod
    def _clean(text: str) -> str:
        """Strip control chars, collapse whitespace, remove noise lines."""
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        text = re.sub(r"[ \t]+", " ", text)
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = line.strip()
            if line and re.search(r"[a-zA-Z0-9]", line):
                cleaned.append(line)
        return "\n".join(cleaned)


class VisionOCR:
    """macOS Vision framework OCR using pyobjc."""

    def __init__(self, languages: list[str] | None = None,
                 min_confidence: float = 0.5, enabled: bool = True):
        self.languages = languages or ["en", "ru"]
        self.min_confidence = min_confidence
        self.enabled = enabled
        self._available = False

        if not enabled:
            return

        try:
            import objc  # noqa: F401
            import Quartz  # noqa: F401
            from Foundation import NSURL, NSData  # noqa: F401
            objc.loadBundle('Vision', bundle_path='/System/Library/Frameworks/Vision.framework',
                            module_globals=globals())
            self._available = True
        except Exception as e:
            print(f"[ocr] Vision framework unavailable: {e}", flush=True)

    def extract(self, image: Image.Image) -> OCRResult:
        """Returns extracted text using macOS Vision framework."""
        if not self.enabled or not self._available:
            return OCRResult(text="", confidence=0, word_count=0)

        try:
            return self._do_extract(image)
        except Exception as e:
            print(f"[ocr] Vision error: {e}", flush=True)
            return OCRResult(text="", confidence=0, word_count=0)

    def _do_extract(self, image: Image.Image) -> OCRResult:
        import objc
        import Vision
        from Foundation import NSData
        import Quartz

        # Convert PIL Image to CGImage via PNG bytes
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        png_data = buf.getvalue()

        ns_data = NSData.dataWithBytes_length_(png_data, len(png_data))
        ci_image = Quartz.CIImage.imageWithData_(ns_data)
        context = Quartz.CIContext.context()
        cg_image = context.createCGImage_fromRect_(ci_image, ci_image.extent())

        if cg_image is None:
            return OCRResult(text="", confidence=0, word_count=0)

        # Create and configure request
        request = Vision.VNRecognizeTextRequest.alloc().init()
        request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
        request.setRecognitionLanguages_(self.languages)
        request.setUsesLanguageCorrection_(True)

        # Execute
        handler = Vision.VNImageRequestHandler.alloc().initWithCGImage_options_(cg_image, None)
        success = handler.performRequests_error_([request], objc.nil)
        if not success[0]:
            return OCRResult(text="", confidence=0, word_count=0)

        results = request.results()
        if not results:
            return OCRResult(text="", confidence=0, word_count=0)

        lines = []
        confidences = []
        word_count = 0

        for observation in results:
            candidate = observation.topCandidates_(1)
            if not candidate:
                continue
            text = candidate[0].string()
            conf = candidate[0].confidence()

            if conf < self.min_confidence:
                continue
            if text and text.strip():
                lines.append(text.strip())
                confidences.append(conf)
                word_count += len(text.split())

        text = "\n".join(lines)
        text = self._clean(text)
        avg_conf = (sum(confidences) / len(confidences) * 100) if confidences else 0

        return OCRResult(
            text=text,
            confidence=avg_conf,
            word_count=word_count,
        )

    @staticmethod
    def _clean(text: str) -> str:
        """Collapse whitespace, remove noise lines."""
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = re.sub(r"[ \t]+", " ", line).strip()
            if line and re.search(r"[a-zA-Z0-9а-яА-ЯёЁ]", line):
                cleaned.append(line)
        return "\n".join(cleaned)


class WinOCR:
    """Windows.Media.Ocr backend via winrt-python (Windows 10+)."""

    def __init__(self, language: str = "en", min_confidence: float = 0.5,
                 enabled: bool = True):
        self.language = language
        self.min_confidence = min_confidence
        self.enabled = enabled
        self._available = False
        self._engine = None

        if not enabled:
            return

        try:
            from winrt.windows.media.ocr import OcrEngine
            from winrt.windows.globalization import Language

            lang = Language(language)
            if OcrEngine.is_language_supported(lang):
                self._engine = OcrEngine.try_create_from_language(lang)
                self._available = self._engine is not None
            else:
                print(f"[ocr] WinOCR: language '{language}' not supported", flush=True)
        except Exception as e:
            print(f"[ocr] WinOCR unavailable: {e}", flush=True)

    def extract(self, image: Image.Image) -> OCRResult:
        """Returns extracted text using Windows.Media.Ocr."""
        if not self.enabled or not self._available:
            return OCRResult(text="", confidence=0, word_count=0)

        try:
            return self._do_extract(image)
        except Exception as e:
            print(f"[ocr] WinOCR error: {e}", flush=True)
            return OCRResult(text="", confidence=0, word_count=0)

    def _do_extract(self, image: Image.Image) -> OCRResult:
        import asyncio
        from winrt.windows.graphics.imaging import (
            SoftwareBitmap, BitmapPixelFormat, BitmapAlphaMode,
        )
        from winrt.windows.storage.streams import (
            InMemoryRandomAccessStream, DataWriter,
        )

        # Convert PIL to BMP bytes and load as SoftwareBitmap
        buf = io.BytesIO()
        image.convert("RGBA").save(buf, format="BMP")
        bmp_bytes = buf.getvalue()

        async def _run():
            stream = InMemoryRandomAccessStream()
            writer = DataWriter(stream)
            writer.write_bytes(bmp_bytes)
            await writer.store_async()
            stream.seek(0)

            from winrt.windows.graphics.imaging import BitmapDecoder
            decoder = await BitmapDecoder.create_async(stream)
            bitmap = await decoder.get_software_bitmap_async()

            # Convert to supported pixel format if needed
            if bitmap.bitmap_pixel_format != BitmapPixelFormat.BGRA8:
                bitmap = SoftwareBitmap.convert(bitmap, BitmapPixelFormat.BGRA8,
                                                BitmapAlphaMode.PREMULTIPLIED)

            result = await self._engine.recognize_async(bitmap)
            return result

        loop = asyncio.new_event_loop()
        try:
            result = loop.run_until_complete(_run())
        finally:
            loop.close()

        lines = []
        word_count = 0
        for line in result.lines:
            text = line.text.strip()
            if text:
                lines.append(text)
                word_count += len(text.split())

        text = "\n".join(lines)
        text = self._clean(text)

        return OCRResult(text=text, confidence=80.0, word_count=word_count)

    @staticmethod
    def _clean(text: str) -> str:
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = re.sub(r"[ \t]+", " ", line).strip()
            if line and re.search(r"[a-zA-Z0-9а-яА-ЯёЁ]", line):
                cleaned.append(line)
        return "\n".join(cleaned)


def create_ocr(config: dict):
    """Factory: create the best available OCR backend based on config + platform.

    config["ocr"] keys:
      backend: "auto" | "vision" | "tesseract" | "winocr"
      languages: list[str] (BCP-47 for Vision / WinOCR, e.g. ["en", "ru"])
      lang: str (Tesseract lang code, e.g. "eng")
      minConfidence: int (0-100 scale)
      enabled: bool
    """
    ocr_cfg = config.get("ocr", {})
    backend = ocr_cfg.get("backend", "auto")
    enabled = ocr_cfg.get("enabled", True)

    # macOS: try Vision framework
    if sys.platform == "darwin" and backend in ("auto", "vision"):
        vision = VisionOCR(
            languages=ocr_cfg.get("languages", ["en", "ru"]),
            min_confidence=ocr_cfg.get("minConfidence", 50) / 100.0,
            enabled=enabled,
        )
        if vision._available:
            print(f"[ocr] using Vision backend (languages={vision.languages})", flush=True)
            return vision
        if backend == "vision":
            print("[ocr] Vision requested but unavailable, falling back to Tesseract", flush=True)

    # Windows: try Windows.Media.Ocr
    if sys.platform == "win32" and backend in ("auto", "winocr"):
        languages = ocr_cfg.get("languages", ["en"])
        winocr = WinOCR(
            language=languages[0] if languages else "en",
            min_confidence=ocr_cfg.get("minConfidence", 50) / 100.0,
            enabled=enabled,
        )
        if winocr._available:
            print(f"[ocr] using WinOCR backend (language={winocr.language})", flush=True)
            return winocr
        if backend == "winocr":
            print("[ocr] WinOCR requested but unavailable, falling back to Tesseract", flush=True)

    # Fallback to Tesseract (cross-platform)
    print("[ocr] using Tesseract backend", flush=True)
    return LocalOCR(
        lang=ocr_cfg.get("lang", "eng"),
        psm=ocr_cfg.get("psm", 11),
        min_confidence=ocr_cfg.get("minConfidence", 50),
        enabled=enabled,
    )
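
A minimal sketch of driving the create_ocr factory with the config keys listed in its docstring (editorial illustration, not part of the diff; the import path and the screenshot file are assumptions):

from PIL import Image

from sense_client.ocr import create_ocr

config = {"ocr": {"backend": "auto", "languages": ["en", "ru"], "minConfidence": 50, "enabled": True}}
ocr = create_ocr(config)  # Vision on macOS, WinOCR on Windows, Tesseract otherwise

frame = Image.open("screenshot.png")  # any PIL image, e.g. a captured frame
result = ocr.extract(frame)
print(result.word_count, round(result.confidence, 1), result.text[:80])
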

package/sense_client/privacy.py
@@ -0,0 +1,65 @@
"""Privacy filter — strips <private> tags and auto-redacts sensitive patterns from OCR text."""

import re

# Patterns that auto-redact without manual tagging
_REDACT_PATTERNS: list[tuple[re.Pattern, str]] = [
    # Credit card numbers (4 groups of 4 digits)
    (re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"), "[REDACTED:card]"),
    # API keys / tokens (long hex or base64 strings)
    (re.compile(r"\b(?:sk-|pk-|api[_-]?key[=:]\s*)[A-Za-z0-9_\-]{20,}\b"), "[REDACTED:apikey]"),
    # Bearer tokens
    (re.compile(r"Bearer\s+[A-Za-z0-9_\-\.]{20,}"), "[REDACTED:bearer]"),
    # AWS secret keys
    (re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b"), "[REDACTED:awskey]"),
    # Passwords in assignment context
    (re.compile(r"(?:password|passwd|pwd)\s*[:=]\s*\S+", re.IGNORECASE), "[REDACTED:password]"),
    # GitHub personal access tokens
    (re.compile(r"\bghp_[A-Za-z0-9]{36}\b"), "[REDACTED:github_pat]"),
    # GitHub server tokens
    (re.compile(r"\bghs_[A-Za-z0-9]{36}\b"), "[REDACTED:github_srv]"),
    # Slack tokens
    (re.compile(r"\bxox[bpoa]-[0-9A-Za-z\-]+"), "[REDACTED:slack]"),
    # Google OAuth tokens
    (re.compile(r"\bya29\.[0-9A-Za-z\-_]+"), "[REDACTED:google_oauth]"),
    # JWT tokens (three base64url segments)
    (re.compile(r"\beyJ[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_]+"), "[REDACTED:jwt]"),
    # Generic secrets / keys in assignment context
    (re.compile(r"(?:secret|token|key)\s*[:=]\s*[A-Za-z0-9_\-\.]{10,}", re.IGNORECASE), "[REDACTED:secret]"),
    # Email addresses
    (re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b"), "[REDACTED:email]"),
    # US phone numbers
    (re.compile(r"\+?1?\s?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}\b"), "[REDACTED:phone]"),
    # SSN (XXX-XX-XXXX)
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED:ssn]"),
    # CVV codes
    (re.compile(r"\bCVV\s*[:=]?\s*\d{3,4}\b", re.IGNORECASE), "[REDACTED:cvv]"),
    # PIN codes in assignment context
    (re.compile(r"\bpin\s*[:=]\s*\d{4,8}\b", re.IGNORECASE), "[REDACTED:pin]"),
    # Private key headers
    (re.compile(r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----"), "[REDACTED:privkey]"),
    # MRN (medical record numbers)
    (re.compile(r"\bMRN\s*[:=]?\s*\d{6,10}\b", re.IGNORECASE), "[REDACTED:mrn]"),
]

# Matches <private>...</private> blocks (including multiline)
_PRIVATE_TAG = re.compile(r"<private>.*?</private>", re.DOTALL)


def strip_private(text: str) -> str:
    """Remove <private>...</private> blocks from text."""
    return _PRIVATE_TAG.sub("", text).strip()


def redact_sensitive(text: str) -> str:
    """Auto-redact patterns that look like secrets or PII."""
    for pattern, replacement in _REDACT_PATTERNS:
        text = pattern.sub(replacement, text)
    return text


def apply_privacy(text: str) -> str:
    """Full privacy pipeline: strip private tags, then auto-redact."""
    text = strip_private(text)
    text = redact_sensitive(text)
    return text
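
A quick illustration of the pipeline that apply_privacy composes from the two helpers above (editorial, with made-up values; assumes sense_client is importable):

from sense_client.privacy import apply_privacy

raw = "login: alice@example.com password: hunter2 <private>internal notes</private>"
print(apply_privacy(raw))
# -> login: [REDACTED:email] [REDACTED:password]
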

package/sense_client/requirements.txt
@@ -0,0 +1,13 @@
pillow>=10.0
scikit-image>=0.22
numpy>=1.24
pytesseract>=0.3
requests>=2.31
mss>=9.0; sys_platform == "win32"
psutil>=5.9; sys_platform == "win32"
winrt-Windows.Media.Ocr>=2.0; sys_platform == "win32"
winrt-Windows.Globalization>=2.0; sys_platform == "win32"
winrt-Windows.Graphics.Imaging>=2.0; sys_platform == "win32"
winrt-Windows.Storage.Streams>=2.0; sys_platform == "win32"
winrt-Windows.Foundation>=2.0; sys_platform == "win32"
winrt-Windows.Foundation.Collections>=2.0; sys_platform == "win32"

package/sense_client/roi_extractor.py
@@ -0,0 +1,84 @@
"""Region of Interest extraction from changed regions."""

from dataclasses import dataclass

import numpy as np
from PIL import Image


@dataclass
class ROI:
    image: Image.Image
    bbox: tuple[int, int, int, int]  # (x, y, w, h)


class ROIExtractor:
    """Extracts and crops changed regions from a frame."""

    def __init__(self, padding: int = 20, min_size: tuple[int, int] = (64, 64),
                 max_rois: int = 3):
        self.padding = padding
        self.min_size = min_size
        self.max_rois = max_rois

    def extract(self, frame: Image.Image, contours: list) -> list[ROI]:
        """Returns list of ROI crops from frame based on contours."""
        if not contours:
            return []

        # Compute bounding boxes for each contour
        boxes = []
        for coords in contours:
            arr = np.array(coords)
            min_y, min_x = arr.min(axis=0)
            max_y, max_x = arr.max(axis=0)
            boxes.append((int(min_x), int(min_y), int(max_x), int(max_y)))

        # Merge overlapping/adjacent boxes
        merged = self._merge_boxes(boxes)

        # Add padding, clamp, crop
        rois = []
        w, h = frame.size
        for x1, y1, x2, y2 in merged[:self.max_rois]:
            x1 = max(0, x1 - self.padding)
            y1 = max(0, y1 - self.padding)
            x2 = min(w, x2 + self.padding)
            y2 = min(h, y2 + self.padding)

            roi_w = x2 - x1
            roi_h = y2 - y1
            if roi_w < self.min_size[0] or roi_h < self.min_size[1]:
                continue

            crop = frame.crop((x1, y1, x2, y2))
            rois.append(ROI(image=crop, bbox=(x1, y1, roi_w, roi_h)))

        return rois

    def _merge_boxes(self, boxes: list[tuple]) -> list[tuple]:
        """Merge overlapping or adjacent bounding boxes."""
        if not boxes:
            return []

        # Sort by x1
        boxes = sorted(boxes, key=lambda b: b[0])
        merged = [list(boxes[0])]

        for x1, y1, x2, y2 in boxes[1:]:
            last = merged[-1]
            # Check if boxes overlap or are within padding distance
            if (x1 <= last[2] + self.padding and
                    y1 <= last[3] + self.padding and
                    y2 >= last[1] - self.padding):
                # Merge
                last[0] = min(last[0], x1)
                last[1] = min(last[1], y1)
                last[2] = max(last[2], x2)
                last[3] = max(last[3], y2)
            else:
                merged.append([x1, y1, x2, y2])

        # Sort by area (largest first)
        merged.sort(key=lambda b: (b[2] - b[0]) * (b[3] - b[1]), reverse=True)
        return [tuple(b) for b in merged]
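
A minimal sketch of feeding the extractor one contour (editorial illustration, not part of the diff). The (row, col) point format is an assumption inferred from the axis handling in extract, and matches what skimage.measure.find_contours returns:

import numpy as np
from PIL import Image

from sense_client.roi_extractor import ROIExtractor

frame = Image.new("RGB", (1920, 1080), "white")  # stand-in for a captured frame
# One contour as (row, col) points outlining a changed region
contour = np.array([[100, 200], [100, 500], [340, 500], [340, 200]])

extractor = ROIExtractor(padding=20, min_size=(64, 64), max_rois=3)
for roi in extractor.extract(frame, [contour]):
    x, y, w, h = roi.bbox
    print(f"ROI at ({x}, {y}), {w}x{h}px, crop {roi.image.size}")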