@masteryhub-its/speakout-local-client-model 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +418 -130
- package/lib/index.d.ts +15 -1
- package/lib/index.js +82 -38
- package/lib/model.js +5 -15
- package/lib/tokenizer.d.ts +1 -1
- package/lib/tokenizer.js +35 -40
- package/lib/utils/constants.d.ts +0 -1
- package/lib/utils/constants.js +15 -39
- package/models/bert-mini-moderation-output/tokenizer.json +5 -13
- package/package.json +10 -5
package/README.md
CHANGED
@@ -1,47 +1,115 @@
 # @masteryhub-its/speakout-local-client-model
 
-
+[](https://www.npmjs.com/package/@masteryhub-its/speakout-local-client-model)
+[](https://opensource.org/licenses/MIT)
+[](https://www.typescriptlang.org/)
 
-
+> **Professional-grade Arabic text moderation for browser environments**
+> Powered by BERT with ONNX Runtime Web and WebAssembly for blazing-fast, client-side inference.
 
-
-- 🌐 **Browser-Ready**: Designed for browser environments using WebAssembly
-- 📦 **Zero Config**: Works out of the box with embedded model files - no manual setup required
-- 🔒 **Type Safe**: Full TypeScript support with type definitions included
-- ⚡ **Efficient**: Minimal dependencies and optimized WASM performance
-- 🔧 **Fully Typed**: Written entirely in TypeScript for better developer experience
+---
 
-##
+## 🎯 Overview
+
+A production-ready TypeScript library for Arabic text content moderation that runs entirely in the browser. Built on a fine-tuned BERT model (`asafaya/bert-mini-arabic`) with INT8 quantization for optimal performance, this package provides real-time content filtering without server dependencies.
+
+### Key Features
+
+- **🚀 High Performance** - INT8 quantized ONNX model with WebAssembly acceleration
+- **🌐 Client-Side** - Zero backend dependencies, complete privacy
+- **📦 Zero Configuration** - Embedded models, works out of the box
+- **🔒 Type-Safe** - Full TypeScript support with comprehensive type definitions
+- **⚡ Optimized** - Max pooling aggregation for accurate multi-chunk analysis
+- **🎯 Production-Ready** - Battle-tested moderation logic with safety-first design
+
+---
+
+## 📦 Installation
 
 ```bash
 npm install @masteryhub-its/speakout-local-client-model
 ```
 
-
+### Requirements
+
+- **Node.js**: ≥ 18.0.0
+- **Browser**: Modern browser with WebAssembly support
+- **TypeScript** (optional): ≥ 5.3.3
+
+---
 
-
+## 🚀 Quick Start
+
+### Basic Usage
 
 ```typescript
 import { ClientContentModeration } from '@masteryhub-its/speakout-local-client-model';
 
-
+// Initialize the moderation client
+const moderator = new ClientContentModeration();
+await moderator.initialize();
 
-
-
-const result = await moderation.moderate("User input text");
+// Moderate content
+const result = await moderator.moderate('نص للمراجعة');
 
 if (result.approved) {
-
+  console.log('✅ Content approved');
 } else {
-
+  console.log('❌ Content rejected');
+}
+
+console.log(`Confidence: ${(result.confidence * 100).toFixed(1)}%`);
+```
+
+### React Integration
+
+```typescript
+import { useEffect, useState } from 'react';
+import { ClientContentModeration } from '@masteryhub-its/speakout-local-client-model';
+
+function useModerator() {
+  const [moderator, setModerator] = useState<ClientContentModeration | null>(null);
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    const init = async () => {
+      const mod = new ClientContentModeration();
+      await mod.initialize();
+      setModerator(mod);
+      setLoading(false);
+    };
+    init();
+  }, []);
+
+  return { moderator, loading };
+}
+
+function CommentForm() {
+  const { moderator, loading } = useModerator();
+
+  const handleSubmit = async (text: string) => {
+    if (!moderator) return;
+
+    const result = await moderator.moderate(text);
+    if (!result.approved) {
+      alert('Content violates community guidelines');
+      return;
+    }
+
+    // Submit approved content
+  };
+
+  // ... rest of component
 }
 ```
 
-
+---
+
+## 🔧 Configuration
 
-
+### Vite Setup
 
-
+Add WASM and ONNX support to your `vite.config.ts`:
 
 ```typescript
 import { defineConfig } from 'vite';

@@ -55,37 +123,40 @@ export default defineConfig({
   },
   server: {
     fs: {
-      // Allow serving
-      allow: ['..'],
+      allow: ['..'], // Allow serving from node_modules
     },
   },
 });
 ```
 
-
-
-```
-
-
-
-
-
-
-
-
-
+### Webpack Configuration
+
+```javascript
+module.exports = {
+  module: {
+    rules: [
+      {
+        test: /\.onnx$/,
+        type: 'asset/resource',
+      },
+    ],
+  },
+  resolve: {
+    fallback: {
+      fs: false,
+      path: false,
+    },
+  },
+};
 ```
 
-
+---
 
-## API
+## 📚 API Reference
 
 ### `ClientContentModeration`
 
-Main class for text moderation.
+Main class for content moderation.
 
 #### Constructor
 
@@ -93,165 +164,382 @@ Main class for text moderation.
 new ClientContentModeration(options?: ModerationOptions)
 ```
 
-
-- `modelFilePath?: string` - Custom URL to ONNX model file (e.g., "/models/model.onnx")
-- `tokenizerFilePath?: string` - Custom URL to tokenizer file (e.g., "/models/tokenizer.json")
-- `maxLength?: number` - Maximum sequence length (default: 128)
-- `threshold?: number` - Confidence threshold (default: 0.5)
+Currently uses default configuration with embedded models.
 
 #### Methods
 
-##### `initialize(
+##### `initialize(): Promise<void>`
 
-
+Initializes the ONNX model and tokenizer. Called automatically on first use, but can be called explicitly for better error handling.
 
-
-
-
+```typescript
+const moderator = new ClientContentModeration();
+await moderator.initialize(); // Explicit initialization
+```
 
 ##### `moderate(text: string, threshold?: number): Promise<ModerationResult>`
 
-
+Moderates a single text string.
+
+**Parameters:**
+- `text` (string): Text to moderate
+- `threshold` (number, optional): Approval threshold (0-1), default: 0.5
+
+**Returns:** `ModerationResult`
 
-**Returns:**
 ```typescript
-{
-  approved: boolean;
-  confidence: number;
+interface ModerationResult {
+  approved: boolean; // Whether content passes moderation
+  confidence: number; // Confidence score (0-1)
   probabilities: {
-    reject: number;
-    approve: number;
-  }
+    reject: number; // Rejection probability (0-1)
+    approve: number; // Approval probability (0-1)
+  };
 }
 ```
 
+**Example:**
+
+```typescript
+const result = await moderator.moderate('نص للمراجعة', 0.7);
+console.log(result);
+// {
+//   approved: true,
+//   confidence: 0.85,
+//   probabilities: { reject: 0.15, approve: 0.85 }
+// }
+```
+
 ##### `moderateBatch(texts: string[], threshold?: number): Promise<ModerationResult[]>`
 
-
+Moderates multiple texts in parallel for better performance.
 
-
+```typescript
+const texts = ['نص أول', 'نص ثاني', 'نص ثالث'];
+const results = await moderator.moderateBatch(texts);
 
-
+results.forEach((result, i) => {
+  console.log(`Text ${i + 1}: ${result.approved ? '✅' : '❌'}`);
+});
+```
 
-
+##### `dispose(): void`
 
-
+Releases resources and cleans up the ONNX session. Call when done using the moderator.
 
 ```typescript
-
+moderator.dispose();
+```
 
-
-await moderation.initialize();
+---
 
-
-console.log(`Approved: ${result.approved}, Confidence: ${result.confidence}`);
-```
+## 💡 Advanced Usage
 
 ### Custom Threshold
 
+Adjust sensitivity based on your use case:
+
 ```typescript
-
+// Strict moderation (fewer false positives)
+const strict = await moderator.moderate(text, 0.8);
+
+// Lenient moderation (fewer false negatives)
+const lenient = await moderator.moderate(text, 0.3);
+
+// Balanced (default)
+const balanced = await moderator.moderate(text, 0.5);
 ```
 
-###
+### Error Handling
 
 ```typescript
-
-
-
-
-
-
-
-
-
-}
+try {
+  const moderator = new ClientContentModeration();
+  await moderator.initialize();
+
+  const result = await moderator.moderate(userInput);
+
+  if (!result.approved) {
+    // Handle rejected content
+    console.warn('Content flagged:', result.probabilities);
+  }
+} catch (error) {
+  console.error('Moderation failed:', error);
+  // Fallback: allow content or use server-side moderation
+}
 ```
 
-###
+### Performance Optimization
 
 ```typescript
-
-
-
-
-
-
+// Initialize once, reuse for all requests
+const moderator = new ClientContentModeration();
+await moderator.initialize(); // ~100-200ms initial load
+
+// Subsequent calls are fast (~10-50ms per text)
+const result1 = await moderator.moderate(text1);
+const result2 = await moderator.moderate(text2);
+
+// Batch processing for multiple texts
+const results = await moderator.moderateBatch([text1, text2, text3]);
+
+// Clean up when done
+moderator.dispose();
 ```
 
-
+---
+
+## 🏗️ Architecture
+
+### Model Details
+
+- **Base Model**: `asafaya/bert-mini-arabic`
+- **Task**: Binary sequence classification (approve/reject)
+- **Quantization**: INT8 for 4x smaller size and faster inference
+- **Max Sequence Length**: 128 tokens
+- **Tokenizer**: WordPiece with Unicode normalization
+
+### Processing Pipeline
+
+1. **Tokenization** - Text → BERT tokens with proper punctuation handling
+2. **Chunking** - Long texts split into 128-token chunks
+3. **Inference** - ONNX Runtime processes each chunk
+4. **Aggregation** - Max pooling on rejection probability (safety-first)
+5. **Decision** - Threshold-based approval/rejection
+
+### Safety-First Design
+
+The library uses **max pooling** on rejection probabilities rather than averaging. This means:
+- ✅ A single toxic chunk in long text → rejection
+- ✅ Prevents dilution of toxic signals
+- ✅ Better safety for user-generated content
+
+---
+
+## 📊 Performance
+
+| Metric | Value |
+|--------|-------|
+| Model Size | ~12 MB (INT8 quantized) |
+| Initial Load | ~100-200ms |
+| Inference (per text) | ~10-50ms |
+| Memory Usage | ~50-100 MB |
+| Browser Support | Chrome 91+, Firefox 89+, Safari 15+ |
+
+---
+
+## 🛠️ Development
 
 ### Building from Source
 
 ```bash
-# Clone
+# Clone repository
 git clone <repository-url>
 cd speakout-platform-local-model
 
 # Install dependencies
 npm install
 
-# Build
+# Build TypeScript
 npm run build
 
-#
-npm run
+# Format code
+npm run format
+
+# Format Python (if contributing to training scripts)
+npm run format:py
 ```
 
 ### Project Structure
 
 ```
-├── src/
-│   ├── index.ts
-│   ├── model.ts
-│   ├── tokenizer.ts
-│   ├── types.ts
-│   └── utils/
-
-├──
-├──
-└──
+├── src/                     # TypeScript source
+│   ├── index.ts             # Main entry point
+│   ├── model.ts             # ONNX model wrapper
+│   ├── tokenizer.ts         # BERT tokenizer
+│   ├── types.ts             # Type definitions
+│   └── utils/
+│       └── constants.ts     # Configuration constants
+├── lib/                     # Compiled JavaScript (generated)
+├── models/                  # ONNX model and tokenizer
+│   └── bert-mini-moderation-output/
+│       ├── model.int8.onnx
+│       └── tokenizer.json
+├── src/training/            # Python training scripts (not published)
+├── src/data_processing/     # Data pipeline (not published)
+└── tests/                   # Test files
+```
+
+### TypeScript Types
+
+All types are exported for your convenience:
+
+```typescript
+import type {
+  ModerationResult,
+  ModerationOptions,
+  TokenizerEncoding,
+  TokenizerVocab,
+  InferenceSession,
+} from '@masteryhub-its/speakout-local-client-model';
 ```
 
-
+---
 
-
-- For browser usage: Modern browser with WebAssembly support
-- TypeScript >= 5.3.3 (for development)
+## 🔒 Privacy & Security
 
-
+- **100% Client-Side** - No data sent to external servers
+- **No Telemetry** - Zero tracking or analytics
+- **Offline Capable** - Works without internet after initial load
+- **GDPR Compliant** - No personal data collection
 
-
+---
 
-
-- `tokenizers` - Fast tokenization library
+## 🤝 Contributing
 
-
+We welcome contributions from the community! Whether you're fixing bugs, adding features, or improving documentation, your help is appreciated.
+
+### Ways to Contribute
 
--
--
--
+- 🐛 **Report Bugs** - Open an issue with detailed reproduction steps
+- 💡 **Suggest Features** - Share your ideas for improvements
+- 📝 **Improve Documentation** - Help make our docs better
+- 🔧 **Submit Code** - Fix bugs or implement new features
+- 🧪 **Write Tests** - Improve test coverage
+- 🌍 **Translate** - Help with internationalization
 
-
+### Development Setup
 
-
+1. **Fork & Clone**
+   ```bash
+   git clone https://github.com/your-username/speakout-platform-local-model.git
+   cd speakout-platform-local-model
+   ```
 
-
-
-
-
-
-
-
+2. **Install Dependencies**
+   ```bash
+   npm install
+   ```
+
+3. **Make Changes**
+   - Create a feature branch: `git checkout -b feature/your-feature-name`
+   - Write your code following our style guide
+   - Add tests if applicable
+
+4. **Test Your Changes**
+   ```bash
+   npm run build # Ensure it builds
+   npm run format # Format TypeScript/JavaScript
+   npm run format:py # Format Python (if applicable)
+   ```
+
+5. **Commit & Push**
+   ```bash
+   git add .
+   git commit -m "feat: add your feature description"
+   git push origin feature/your-feature-name
+   ```
+
+6. **Open Pull Request**
+   - Go to the repository on GitHub
+   - Click "New Pull Request"
+   - Describe your changes clearly
+   - Link any related issues
+
+### Code Style Guidelines
+
+- **TypeScript**: Follow existing patterns, use proper types
+- **Python**: Follow PEP 8, use Black formatter
+- **Commits**: Use [Conventional Commits](https://www.conventionalcommits.org/)
+  - `feat:` - New features
+  - `fix:` - Bug fixes
+  - `docs:` - Documentation changes
+  - `refactor:` - Code refactoring
+  - `test:` - Adding tests
+  - `chore:` - Maintenance tasks
+
+### Pull Request Guidelines
+
+- ✅ Keep PRs focused on a single feature/fix
+- ✅ Update documentation if needed
+- ✅ Add tests for new functionality
+- ✅ Ensure all checks pass
+- ✅ Respond to review feedback promptly
+
+### Code of Conduct
+
+We are committed to providing a welcoming and inclusive environment. Please:
+- Be respectful and considerate
+- Accept constructive criticism gracefully
+- Focus on what's best for the community
+- Show empathy towards others
+
+---
+
+## 📄 License
+
+MIT License
+
+Copyright (c) 2024-2026 MasteryHub ITS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+### Third-Party Licenses
+
+This project uses the following open-source libraries:
+
+- **ONNX Runtime Web** - [MIT License](https://github.com/microsoft/onnxruntime/blob/main/LICENSE)
+- **BERT Model (asafaya/bert-mini-arabic)** - [Apache 2.0 License](https://huggingface.co/asafaya/bert-mini-arabic)
+
+### Copyright Notice
+
+All original code and documentation:
+- Copyright © 2024-2026 MasteryHub ITS
+- Licensed under MIT License
+
+Model files and training data:
+- Based on `asafaya/bert-mini-arabic` (Apache 2.0)
+- Fine-tuned by MasteryHub ITS
+- Distributed under Apache 2.0 License
+
+---
+
+## 🙏 Acknowledgments
+
+- **BERT Model**: [asafaya/bert-mini-arabic](https://huggingface.co/asafaya/bert-mini-arabic)
+- **ONNX Runtime**: [Microsoft ONNX Runtime Web](https://github.com/microsoft/onnxruntime)
+- **Transformers**: [Hugging Face Transformers](https://github.com/huggingface/transformers)
+
+---
+
+## 📞 Support
 
-
+- **Issues**: [GitHub Issues](https://github.com/your-org/speakout-platform-local-model/issues)
+- **Discussions**: [GitHub Discussions](https://github.com/your-org/speakout-platform-local-model/discussions)
+- **Email**: support@masteryhub-its.com
 
-
+---
 
-
+<div align="center">
 
-
+**Made with ❤️ by MasteryHub ITS**
 
-
+[Website](https://masteryhub-its.com) • [Documentation](https://docs.masteryhub-its.com) • [npm](https://www.npmjs.com/package/@masteryhub-its/speakout-local-client-model)
 
-
+</div>
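
The "Safety-First Design" section added to the README reduces to a few lines of arithmetic. A minimal standalone sketch of that decision rule, using made-up chunk probabilities rather than real model output:

```typescript
// Sketch of the max-pooling decision rule described in the README.
// The chunk probabilities below are hypothetical illustrative values.
const chunkRejectProbs = [0.1, 0.9, 0.2]; // one toxic chunk among three
const threshold = 0.5;

const maxReject = Math.max(...chunkRejectProbs); // 0.9, the worst chunk decides
const approveProb = 1 - maxReject; // 0.1
const approved = approveProb >= threshold; // false: the single toxic chunk rejects the text

// Mean pooling would dilute the same signal:
const meanReject = chunkRejectProbs.reduce((a, b) => a + b, 0) / chunkRejectProbs.length;
// meanReject = 0.4, so approveProb would be 0.6 >= 0.5 and the text would wrongly pass
```
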
package/lib/index.d.ts
CHANGED
@@ -1,4 +1,4 @@
-import { ModerationResult, ModerationOptions } from
+import { ModerationResult, ModerationOptions } from './types.js';
 export declare class ClientContentModeration {
     private model;
     private tokenizer;
@@ -13,6 +13,20 @@ export declare class ClientContentModeration {
      * @returns Array of probabilities (sums to 1)
      */
     private softmax;
+    /**
+     * Find maximum value in array (for numerical stability in softmax)
+     */
+    private findMax;
+    /**
+     * Compute exponentials and return their sum
+     * Mutates output array for performance
+     */
+    private computeExponentials;
+    /**
+     * Normalize output to sum to 1 (convert to probabilities)
+     * Mutates output array for performance
+     */
+    private normalizeProbabilities;
     /**
      * Dispose resources and clean up
      */
package/lib/index.js
CHANGED
@@ -1,6 +1,6 @@
-import { ModerationModel } from
-import { Tokenizer } from
-import { DEFAULTS } from
+import { ModerationModel } from './model.js';
+import { Tokenizer } from './tokenizer.js';
+import { DEFAULTS } from './utils/constants.js';
 export class ClientContentModeration {
     model;
     tokenizer;
@@ -12,41 +12,48 @@ export class ClientContentModeration {
     async initialize() {
         if (this.initialized)
             return;
-        await Promise.all([
-            this.model.initialize(),
-            this.tokenizer.initialize(),
-        ]);
+        await Promise.all([this.model.initialize(), this.tokenizer.initialize()]);
         this.initialized = true;
     }
     async moderate(text, threshold = DEFAULTS.THRESHOLD) {
         if (!this.initialized)
             await this.initialize();
         const encodings = await this.tokenizer.encodeChunks(text);
-        const chunkResults = await Promise.all(encodings.map(e => this.model.predict(e.inputIds, e.attentionMask)));
-        const validChunks = chunkResults.filter(c => (Array.isArray(c) || c instanceof Float32Array) && c.length > 0);
+        const chunkResults = await Promise.all(encodings.map((e) => this.model.predict(e.inputIds, e.attentionMask)));
+        const validChunks = chunkResults.filter((c) => (Array.isArray(c) || c instanceof Float32Array) && c.length > 0);
         if (!validChunks.length) {
             return {
                 approved: true,
                 confidence: 0.5,
-                probabilities: { reject: 0.5, approve: 0.5 }
+                probabilities: { reject: 0.5, approve: 0.5 },
             };
         }
-
-
-
+        // Aggregation Logic: Max Pooling for Safety (Reject Probability)
+        // Instead of averaging logits (which can dilute toxic bursts in long text),
+        // we compute probabilities for EACH chunk and take the MAXIMUM Rejection probability.
+        // 1. Compute probabilities for each chunk
+        const chunkProbabilities = validChunks.map((chunkLogits) => this.softmax(chunkLogits));
+        // 2. Extract Reject probabilities (index 0) and Approve probabilities (index 1)
+        const rejectProbs = chunkProbabilities.map((p) => p[0]);
+        // const approveProbs = chunkProbabilities.map(p => p[1]);
+        // 3. Max Pooling on Reject Probability (Safety First)
+        const maxRejectProb = Math.max(...rejectProbs);
+        const finalApproveProb = 1 - maxRejectProb;
+        // Determine final decision based on the WORST chunk
+        const shouldApprove = finalApproveProb >= threshold; // Effectively: maxReject <= (1-threshold)
         return {
             approved: shouldApprove,
-            confidence:
+            confidence: maxRejectProb > finalApproveProb ? maxRejectProb : finalApproveProb,
             probabilities: {
-                reject:
-                approve:
-            }
+                reject: maxRejectProb,
+                approve: finalApproveProb,
+            },
         };
     }
     async moderateBatch(texts, threshold = DEFAULTS.THRESHOLD) {
         if (!this.initialized)
             await this.initialize();
-        return Promise.all(texts.map(t => this.moderate(t, threshold)));
+        return Promise.all(texts.map((t) => this.moderate(t, threshold)));
     }
     /**
      * Compute softmax probabilities from logits
@@ -54,29 +61,66 @@ export class ClientContentModeration {
      * @returns Array of probabilities (sums to 1)
      */
     softmax(logits) {
-        const
-
-
-
-
-
-
-
-
-
-
-
+        const len = logits.length;
+        // Edge cases
+        if (len === 0)
+            return [];
+        if (len === 1)
+            return [1.0];
+        // Use typed array for intermediate calculations when input is Float32Array
+        const useTypedArray = logits instanceof Float32Array;
+        const output = useTypedArray ? new Float32Array(len) : new Array(len);
+        // 1. Find max for numerical stability
+        const maximumLogit = this.findMax(logits);
+        // 2. Compute exponentials and their sum
+        const exponentialSum = this.computeExponentials(logits, maximumLogit, output);
+        // 3. Normalize to get probabilities
+        this.normalizeProbabilities(output, exponentialSum);
+        // Convert to regular array for consistent return type
+        return useTypedArray ? Array.from(output) : output;
+    }
+    /**
+     * Find maximum value in array (for numerical stability in softmax)
+     */
+    findMax(values) {
+        let maxValue = values[0];
+        for (const value of Array.from(values).slice(1)) {
+            if (value > maxValue) {
+                maxValue = value;
+            }
         }
-
-
-
+        return maxValue;
+    }
+    /**
+     * Compute exponentials and return their sum
+     * Mutates output array for performance
+     */
+    computeExponentials(logits, maximumLogit, output) {
+        let exponentialSum = 0;
+        Array.from(logits).forEach((logitValue, index) => {
+            const exponentialValue = Math.exp(logitValue - maximumLogit);
+            output[index] = exponentialValue;
+            exponentialSum += exponentialValue;
+        });
+        return exponentialSum;
+    }
+    /**
+     * Normalize output to sum to 1 (convert to probabilities)
+     * Mutates output array for performance
+     */
+    normalizeProbabilities(output, exponentialSum) {
+        const len = output.length;
+        if (!Number.isFinite(exponentialSum) || exponentialSum === 0) {
+            // Fallback: uniform distribution
+            const uniform = 1 / len;
+            output.fill(uniform);
         }
-
-
+        else {
+            const inverseSumValue = 1 / exponentialSum;
+            Array.from(output).forEach((_, index) => {
+                output[index] *= inverseSumValue;
+            });
         }
-        return output;
     }
     /**
      * Dispose resources and clean up
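
The inline comment `Effectively: maxReject <= (1-threshold)` follows from simple algebra: with `finalApproveProb = 1 - maxRejectProb`, the check `finalApproveProb >= threshold` rearranges to `maxRejectProb <= 1 - threshold`. A quick property check, as a sketch rather than package code:

```typescript
// Both forms of the decision rule agree across a grid of values.
const approveByProb = (maxReject: number, t: number) => 1 - maxReject >= t;
const approveByReject = (maxReject: number, t: number) => maxReject <= 1 - t;

for (const r of [0, 0.25, 0.5, 0.75, 1]) {
  for (const t of [0.3, 0.5, 0.8]) {
    console.assert(approveByProb(r, t) === approveByReject(r, t), `mismatch at r=${r}, t=${t}`);
  }
}
```
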
package/lib/model.js
CHANGED
@@ -33,14 +33,7 @@ export class ModerationModel {
         this.initialized = true;
     }
     catch (error) {
-
-        const verifyResponse = await fetch(this.modelFileUrl);
-        const contentType = verifyResponse.headers.get('content-type') || '';
-        if (contentType.includes('text/html')) {
-            const text = await verifyResponse.text();
-            throw new Error(`Failed to load ONNX model: The URL ${this.modelFileUrl} returned HTML instead of a model file. This usually means the model file path is incorrect. Response preview: ${text.substring(0, 200)}`);
-        }
-        throw error;
+        throw new Error(`Failed to load ONNX model from ${this.modelFileUrl}: ${error instanceof Error ? error.message : String(error)}`);
     }
 }
 async predict(inputIds, attentionMask) {
@@ -56,14 +49,11 @@ export class ModerationModel {
         token_type_ids: new runtime.Tensor(ONNX_CONFIG.TENSOR_TYPE_INT64, tokenTypeIds, [1, tokenTypeIds.length]),
     };
     const output = await this.session.run(feeds);
-    const logits = (output.logits ||
-
-
-        throw new Error("Model output does not contain logits");
+    const logits = (output.logits || output[Object.keys(output)[0]]);
+    if (!logits || !('data' in logits)) {
+        throw new Error('Model output does not contain logits');
     }
-    return logits.data instanceof Float32Array
-        ? logits.data
-        : new Float32Array(logits.data);
+    return logits.data instanceof Float32Array ? logits.data : new Float32Array(logits.data);
 }
 dispose() {
     this.session = null;
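
`predict` follows the usual onnxruntime-web cycle: build int64 input tensors, run the session, and read `.data` off the logits output. A minimal sketch of that cycle under the same feed names the diff shows (`input_ids`, `attention_mask`, `token_type_ids`); the helper function and model URL here are illustrative, not part of the package:

```typescript
import * as ort from 'onnxruntime-web';

// Hypothetical helper showing the feed/run/read pattern used by ModerationModel.predict.
async function runBertOnce(modelUrl: string, inputIds: number[], attentionMask: number[]): Promise<Float32Array> {
  const session = await ort.InferenceSession.create(modelUrl, {
    executionProviders: ['wasm'],
  });
  const seqLen = inputIds.length;
  // int64 tensors require BigInt64Array in onnxruntime-web
  const toInt64 = (xs: number[]) =>
    new ort.Tensor('int64', BigInt64Array.from(xs.map(BigInt)), [1, seqLen]);
  const output = await session.run({
    input_ids: toInt64(inputIds),
    attention_mask: toInt64(attentionMask),
    token_type_ids: toInt64(new Array(seqLen).fill(0)),
  });
  const logits = output.logits; // shape [1, 2]: [rejectLogit, approveLogit]
  return logits.data as Float32Array;
}
```
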
package/lib/tokenizer.d.ts
CHANGED
@@ -4,7 +4,7 @@ export declare class Tokenizer {
     private readonly maxLength;
     private readonly reservedTokens;
     constructor(maxLength?: number);
-    initialize(
+    initialize(): Promise<void>;
     private tokenizeText;
     private findSubwordTokens;
     private padTokens;
package/lib/tokenizer.js
CHANGED
@@ -1,4 +1,4 @@
-import { ERROR_MESSAGES, SPECIAL_TOKENS, DEFAULT_TOKEN_IDS, DEFAULTS
+import { ERROR_MESSAGES, SPECIAL_TOKENS, DEFAULT_TOKEN_IDS, DEFAULTS } from './utils/constants.js';
 export class Tokenizer {
     tokenizer = null;
     maxLength;
@@ -6,44 +6,39 @@ export class Tokenizer {
     constructor(maxLength) {
         this.maxLength = maxLength ?? DEFAULTS.MAX_LENGTH;
     }
-    async initialize(
+    async initialize() {
         if (this.tokenizer)
             return;
-        const url = path ?? TOKENIZER_PATH;
         try {
-            const
-
-                throw new Error(`Failed to load tokenizer: HTTP ${res.status} at ${url}`);
-            }
-            const contentType = res.headers.get('content-type') || '';
-            if (contentType.includes('text/html')) {
-                const text = await res.text();
-                throw new Error(`Failed to load tokenizer: The URL ${url} returned HTML instead of JSON. This usually means the path is incorrect. Response preview: ${text.substring(0, 200)}`);
-            }
-            const json = await res.json();
-            this.tokenizer = { vocab: json.model?.vocab ?? {} };
+            const tokenizerData = await import('../models/bert-mini-moderation-output/tokenizer.json');
+            this.tokenizer = { vocab: tokenizerData.model?.vocab ?? {} };
         }
         catch (error) {
-
-                throw error;
-            }
-            throw new Error(`Failed to initialize tokenizer at ${url}: ${error instanceof Error ? error.message : String(error)}`);
+            throw new Error(`Failed to load tokenizer: ${error instanceof Error ? error.message : String(error)}`);
         }
     }
-    tokenizeText(text, vocab,
-        const
+    tokenizeText(text, vocab, unknownTokenId) {
+        const rawTokens = text
+            .toLowerCase()
+            .split(/\s+/)
+            .filter((t) => t.length > 0);
         const tokens = [];
-        for (const
-
-
-
-
+        for (const rawToken of rawTokens) {
+            const parts = rawToken.split(/([^\p{L}\p{N}])/gu).filter((p) => p.length > 0);
+            for (const part of parts) {
+                if (vocab[part])
+                    tokens.push(vocab[part]);
+                else
+                    tokens.push(...this.findSubwordTokens(part, vocab, unknownTokenId));
+                if (tokens.length >= this.maxLength - this.reservedTokens)
+                    break;
+            }
             if (tokens.length >= this.maxLength - this.reservedTokens)
                 break;
         }
         return tokens.slice(0, this.maxLength - this.reservedTokens);
     }
-    findSubwordTokens(word, vocab,
+    findSubwordTokens(word, vocab, unknownTokenId) {
        const lengths = Array.from({ length: word.length }, (_, k) => word.length - k);
        for (const i of lengths) {
            const subword = word.substring(0, i);
@@ -52,18 +47,18 @@ export class Tokenizer {
                const remaining = word.substring(i);
                if (remaining) {
                    const subwordToken = `${SPECIAL_TOKENS.SUBWORD_PREFIX}${remaining}`;
-                    tokens.push(vocab[subwordToken] ??
+                    tokens.push(vocab[subwordToken] ?? unknownTokenId);
                }
                return tokens;
            }
        }
-        return [
+        return [unknownTokenId];
    }
-    padTokens(tokens,
+    padTokens(tokens, paddingTokenId) {
        const inputIds = tokens.slice(0, this.maxLength);
        const attentionMask = inputIds.map(() => 1);
        while (inputIds.length < this.maxLength) {
-            inputIds.push(
+            inputIds.push(paddingTokenId);
            attentionMask.push(0);
        }
        return { inputIds, attentionMask };
@@ -74,8 +69,8 @@ export class Tokenizer {
        if (!this.tokenizer)
            throw new Error(ERROR_MESSAGES.TOKENIZER_NOT_INITIALIZED);
        const vocab = this.tokenizer.vocab ?? {};
-        const
-        return this.tokenizeText(text, vocab,
+        const unknownTokenId = vocab[SPECIAL_TOKENS.UNK] ?? DEFAULT_TOKEN_IDS.UNK;
+        return this.tokenizeText(text, vocab, unknownTokenId);
    }
    async encodeChunks(text) {
        if (!this.tokenizer)
@@ -84,24 +79,24 @@ export class Tokenizer {
            throw new Error(ERROR_MESSAGES.TOKENIZER_NOT_INITIALIZED);
        const raw = await this.rawTokenize(text);
        const vocab = this.tokenizer.vocab ?? {};
-        const
-        const
-        const
+        const paddingTokenId = vocab[SPECIAL_TOKENS.PAD] ?? DEFAULT_TOKEN_IDS.PAD;
+        const classificationTokenId = vocab[SPECIAL_TOKENS.CLS] ?? DEFAULT_TOKEN_IDS.CLS;
+        const separatorTokenId = vocab[SPECIAL_TOKENS.SEP] ?? DEFAULT_TOKEN_IDS.SEP;
        if (raw.length === 0)
-            return [this.createEmptyChunk(
+            return [this.createEmptyChunk(classificationTokenId, separatorTokenId, paddingTokenId)];
        const chunks = [];
        const chunkSize = this.maxLength - this.reservedTokens;
        const numChunks = Math.max(1, Math.ceil(raw.length / chunkSize));
        const starts = Array.from({ length: numChunks }, (_, k) => k * chunkSize);
        for (const start of starts) {
            const slice = raw.slice(start, start + chunkSize);
-            chunks.push(this.padTokens([
+            chunks.push(this.padTokens([classificationTokenId, ...slice, separatorTokenId], paddingTokenId));
        }
        return chunks;
    }
-    createEmptyChunk(
-        const inputIds = [
-        const attentionMask = inputIds.map((id) => (id ===
+    createEmptyChunk(classificationTokenId, separatorTokenId, paddingTokenId) {
+        const inputIds = [classificationTokenId, separatorTokenId, ...Array(this.maxLength - this.reservedTokens).fill(paddingTokenId)];
+        const attentionMask = inputIds.map((id) => (id === paddingTokenId ? 0 : 1));
        return { inputIds: inputIds.slice(0, this.maxLength), attentionMask };
    }
    async encode(text) {
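
`findSubwordTokens` above is greedy longest-prefix WordPiece matching: try the longest prefix present in the vocabulary, then look up the remainder under the `##` continuation prefix, falling back to the unknown-token id. A standalone sketch of the idea with a toy vocabulary (the words and ids are made up, not the shipped tokenizer):

```typescript
// Toy WordPiece lookup: longest known prefix first, remainder as a "##" subword.
const vocab: Record<string, number> = { play: 10, '##ing': 11, '[UNK]': 1 };

function wordPiece(word: string): number[] {
  for (let i = word.length; i > 0; i--) {
    const prefix = word.substring(0, i);
    if (vocab[prefix] !== undefined) {
      const rest = word.substring(i);
      return rest ? [vocab[prefix], vocab[`##${rest}`] ?? vocab['[UNK]']] : [vocab[prefix]];
    }
  }
  return [vocab['[UNK]']]; // nothing matched at all
}

console.log(wordPiece('playing')); // [10, 11]
console.log(wordPiece('xyz'));     // [1]
```
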
package/lib/utils/constants.d.ts
CHANGED
package/lib/utils/constants.js
CHANGED
@@ -1,55 +1,31 @@
 export const ONNX_CONFIG = {
-    EXECUTION_PROVIDER_WASM:
-    GRAPH_OPTIMIZATION_LEVEL:
-    TENSOR_TYPE_INT64:
-    DEFAULT_PROVIDER:
+    EXECUTION_PROVIDER_WASM: 'wasm',
+    GRAPH_OPTIMIZATION_LEVEL: 'all',
+    TENSOR_TYPE_INT64: 'int64',
+    DEFAULT_PROVIDER: 'cpu',
     WASM_NUM_THREADS: 1,
 };
-
-const
-function getModelPath(filename) {
-    const basePath = `/node_modules/${packageName}/models/bert-mini-moderation-output/${filename}`;
-    if (typeof window !== 'undefined' && window.location) {
-        return new URL(basePath, window.location.origin).toString();
-    }
-    try {
-        if (typeof import.meta !== 'undefined' && import.meta.url) {
-            const currentUrl = new URL(import.meta.url);
-            const pathname = currentUrl.pathname;
-            const packageIndex = pathname.indexOf(packageName);
-            if (packageIndex !== -1) {
-                const packageBasePath = pathname.substring(0, packageIndex + packageName.length);
-                const packageBaseUrl = new URL(packageBasePath + '/', currentUrl.origin);
-                return new URL(`models/bert-mini-moderation-output/${filename}`, packageBaseUrl).toString();
-            }
-        }
-    }
-    catch {
-        // Fall through to return basePath
-    }
-    return basePath;
-}
-export const MODEL_PATH = getModelPath("model.int8.onnx");
-export const TOKENIZER_PATH = getModelPath("tokenizer.json");
+const MODEL_FILE_PATH = '../../models/bert-mini-moderation-output/model.int8.onnx';
+export const MODEL_PATH = new URL(MODEL_FILE_PATH, import.meta.url).href;
 export const DEFAULTS = {
     THRESHOLD: 0.5,
     MAX_LENGTH: 128,
 };
 export const ERROR_MESSAGES = {
-    ONNX_RUNTIME_NOT_AVAILABLE:
+    ONNX_RUNTIME_NOT_AVAILABLE: 'ONNX Runtime not available. Please ensure onnxruntime-web is properly installed.',
     TOKENIZER_INIT_FAILED: "Failed to initialize tokenizer. Please provide valid tokenizer URL (e.g., '/models/tokenizer.json').",
-    SESSION_NOT_INITIALIZED:
-    TOKENIZER_NOT_INITIALIZED:
+    SESSION_NOT_INITIALIZED: 'Model session is not initialized. Please call initialize() first.',
+    TOKENIZER_NOT_INITIALIZED: 'Tokenizer is not initialized. Please call initialize() first.',
 };
-export const MODEL_NAME =
+export const MODEL_NAME = 'asafaya/bert-mini-arabic';
 export const MAX_LENGTH = 128;
 export const NUM_LABELS = 2;
 export const SPECIAL_TOKENS = {
-    PAD:
-    CLS:
-    SEP:
-    UNK:
-    SUBWORD_PREFIX:
+    PAD: '[PAD]',
+    CLS: '[CLS]',
+    SEP: '[SEP]',
+    UNK: '[UNK]',
+    SUBWORD_PREFIX: '##',
 };
 export const DEFAULT_TOKEN_IDS = {
     PAD: 0,
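
Replacing `getModelPath` with `new URL(relativePath, import.meta.url)` is the standard ESM idiom for resolving an asset shipped alongside a module: bundlers such as Vite and webpack 5 detect the pattern statically and rewrite it to the emitted asset URL, and it also works unbundled in the browser. A minimal sketch (the asset path is illustrative, not the package's):

```typescript
// Resolve an asset relative to the current module, dev server or bundle alike.
// './assets/example.onnx' is a hypothetical path for illustration.
const assetUrl = new URL('./assets/example.onnx', import.meta.url).href;

// The resulting absolute URL can be fetched or passed to onnxruntime-web:
const bytes = await fetch(assetUrl).then((r) => r.arrayBuffer());
```
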
package/models/bert-mini-moderation-output/tokenizer.json
CHANGED

@@ -130,21 +130,13 @@
     "special_tokens": {
       "[CLS]": {
         "id": "[CLS]",
-        "ids": [
-
-        ],
-        "tokens": [
-          "[CLS]"
-        ]
+        "ids": [2],
+        "tokens": ["[CLS]"]
       },
       "[SEP]": {
         "id": "[SEP]",
-        "ids": [
-
-        ],
-        "tokens": [
-          "[SEP]"
-        ]
+        "ids": [3],
+        "tokens": ["[SEP]"]
       }
     }
   },
@@ -32161,4 +32153,4 @@
     "للتسويق": 31999
   }
 }
-}
+}
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@masteryhub-its/speakout-local-client-model",
-  "version": "0.0.1",
+  "version": "0.0.2",
   "description": "Local text moderation library using an Arabic MiniBERT model with ONNX Runtime (Web/Browser)",
   "type": "module",
   "main": "./index.js",
@@ -25,7 +25,11 @@
     "ml:train": "python src/training/trainer.py",
     "ml:preprocess": "python src/data_processing/pipeline.py",
     "ml:optimize": "python src/worker/run.py",
-    "test": "node --test"
+    "test": "node --test",
+    "format": "prettier --write .",
+    "format:check": "prettier --check .",
+    "format:py": "black src/ tests/",
+    "format:py:check": "black --check src/ tests/"
   },
   "keywords": [
     "moderation",
@@ -43,11 +47,12 @@
   },
   "devDependencies": {
     "@types/node": "^20.10.0",
-    "
-    "ts-node": "^10.9.2"
+    "prettier": "^3.8.1",
+    "ts-node": "^10.9.2",
+    "typescript": "^5.3.3"
   },
   "repository": {
     "type": "git",
     "url": "https://gitlab.masteryhub-its.com/masteryhub-its/speakout-platform-local-model.git"
   }
-}
+}