@ai-pip/csl 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/layers/csl/index.ts +1 -0
  2. package/layers/csl/src/adapters/index.ts +10 -0
  3. package/layers/csl/src/adapters/input/DOMAdapter.ts +236 -0
  4. package/layers/csl/src/adapters/input/UIAdapter.ts +0 -0
  5. package/layers/csl/src/adapters/output/ConsoleLogger.ts +34 -0
  6. package/layers/csl/src/adapters/output/CryptoHashGenerator.ts +29 -0
  7. package/layers/csl/src/adapters/output/FilePolicyRepository.ts +0 -0
  8. package/layers/csl/src/adapters/output/InMemoryPolicyRepository.ts +135 -0
  9. package/layers/csl/src/adapters/output/SystemTimestampProvider.ts +9 -0
  10. package/layers/csl/src/domain/entities/CSLResult.ts +309 -0
  11. package/layers/csl/src/domain/entities/Segment.ts +338 -0
  12. package/layers/csl/src/domain/entities/index.ts +2 -0
  13. package/layers/csl/src/domain/exceptions/ClassificationError.ts +26 -0
  14. package/layers/csl/src/domain/exceptions/SegmentationError.ts +30 -0
  15. package/layers/csl/src/domain/exceptions/index.ts +2 -0
  16. package/layers/csl/src/domain/index.ts +4 -0
  17. package/layers/csl/src/domain/services/AnomalyService.ts +255 -0
  18. package/layers/csl/src/domain/services/LineageService.ts +224 -0
  19. package/layers/csl/src/domain/services/NormalizationService.ts +392 -0
  20. package/layers/csl/src/domain/services/OriginClassificationService.ts +69 -0
  21. package/layers/csl/src/domain/services/PiDetectionService.ts +475 -0
  22. package/layers/csl/src/domain/services/PolicyService.ts +296 -0
  23. package/layers/csl/src/domain/services/SegmentClassificationService.ts +105 -0
  24. package/layers/csl/src/domain/services/SerializationService.ts +229 -0
  25. package/layers/csl/src/domain/services/index.ts +7 -0
  26. package/layers/csl/src/domain/value-objects/AnomalyScore.ts +23 -0
  27. package/layers/csl/src/domain/value-objects/ContentHash.ts +54 -0
  28. package/layers/csl/src/domain/value-objects/LineageEntry.ts +42 -0
  29. package/layers/csl/src/domain/value-objects/Origin-map.ts +67 -0
  30. package/layers/csl/src/domain/value-objects/Origin.ts +99 -0
  31. package/layers/csl/src/domain/value-objects/Pattern.ts +221 -0
  32. package/layers/csl/src/domain/value-objects/PiDetection.ts +140 -0
  33. package/layers/csl/src/domain/value-objects/PiDetectionResult.ts +275 -0
  34. package/layers/csl/src/domain/value-objects/PolicyRule.ts +151 -0
  35. package/layers/csl/src/domain/value-objects/TrustLevel.ts +34 -0
  36. package/layers/csl/src/domain/value-objects/index.ts +10 -0
  37. package/layers/csl/src/index.ts +7 -0
  38. package/layers/csl/src/ports/index.ts +10 -0
  39. package/layers/csl/src/ports/input/ClassificationPort.ts +76 -0
  40. package/layers/csl/src/ports/input/SegmentationPort.ts +81 -0
  41. package/layers/csl/src/ports/output/DOMAdapter.ts +14 -0
  42. package/layers/csl/src/ports/output/HashGenerator.ts +18 -0
  43. package/layers/csl/src/ports/output/Logger.ts +17 -0
  44. package/layers/csl/src/ports/output/PolicyRepository.ts +29 -0
  45. package/layers/csl/src/ports/output/SegmentClassified.ts +8 -0
  46. package/layers/csl/src/ports/output/TimeStampProvider.ts +5 -0
  47. package/layers/csl/src/services/CSLService.ts +393 -0
  48. package/layers/csl/src/services/index.ts +1 -0
  49. package/layers/csl/src/types/entities-types.ts +37 -0
  50. package/layers/csl/src/types/index.ts +4 -0
  51. package/layers/csl/src/types/pi-types.ts +111 -0
  52. package/layers/csl/src/types/port-output-types.ts +17 -0
  53. package/layers/csl/src/types/value-objects-types.ts +213 -0
  54. package/layers/csl/src/utils/colors.ts +25 -0
  55. package/layers/csl/src/utils/pattern-helpers.ts +174 -0
  56. package/package.json +4 -5
  57. package/src/index.ts +36 -36
@@ -0,0 +1,392 @@
1
/**
 * NormalizationService normalizes text content so that later analysis sees a
 * single, predictable representation.
 *
 * @remarks
 * The service removes or rewrites characters that can be used to hide or
 * disguise content (zero-width characters, directional marks, control
 * characters, HTML entities, mixed line endings).
 *
 * **Steps applied by {@link NormalizationService.normalize}, in order:**
 * 1. Leading BOM removal
 * 2. Invisible character removal (so hidden characters cannot split an entity)
 * 3. HTML entity decoding (single pass)
 * 4. Invisible character removal again (entities may decode to such characters)
 * 5. Unicode normalization to NFC
 * 6. Space/tab collapsing
 * 7. Line break normalization to `\n`
 * 8. Control character removal (except `\n` and `\t`)
 * 9. Trimming of leading/trailing whitespace
 *
 * **Usage:**
 * All methods are static; the service holds no mutable state. Static methods
 * reference the class by name rather than `this`, so they keep working when
 * passed around as detached callbacks.
 *
 * @example
 * ```typescript
 * const normalized = NormalizationService.normalize('Hello\u200B\u200Cworld')
 * // Returns: 'Helloworld'
 *
 * // Or use individual methods
 * const unicodeNormalized = NormalizationService.normalizeUnicode('café')
 * ```
 */
export class NormalizationService {
  /** Characters stripped by `removeInvisibleCharacters`; built once, not per call. */
  private static readonly INVISIBLE_CHARS: ReadonlySet<string> = new Set([
    '\u200B', // Zero-width space
    '\u200C', // Zero-width non-joiner
    '\u200D', // Zero-width joiner
    '\uFEFF', // Zero-width no-break space (BOM)
    '\u2060', // Word joiner
    '\u200E', // Left-to-right mark
    '\u200F', // Right-to-left mark
    '\u202A', // Left-to-right embedding
    '\u202B', // Right-to-left embedding
    '\u202C', // Pop directional formatting
    '\u202D', // Left-to-right override
    '\u202E', // Right-to-left override
    '\u202F', // Narrow no-break space
    '\u00AD', // Soft hyphen
  ])

  /**
   * Named entities understood by the manual decoder, keyed by entity name
   * (case-sensitive, without `&` and `;`). A Map is used so that names such as
   * `constructor` cannot resolve to inherited object properties.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['lt', '<'],
    ['gt', '>'],
    ['amp', '&'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' '],
    ['copy', '©'],
    ['reg', '®'],
    ['trade', '™'],
    ['hellip', '…'],
    ['mdash', '—'],
    ['ndash', '–'],
  ])

  /**
   * Matches one entity: hexadecimal (`&#x1F;` / `&#X1F;`, group 1),
   * decimal (`&#123;`, group 2) or named (`&lt;`, group 3).
   */
  private static readonly ENTITY_PATTERN = /&(?:#[xX]([0-9A-Fa-f]+)|#(\d+)|([A-Za-z]+));/g

  /**
   * C0 controls except `\t` (0x09) and `\n` (0x0A), plus DEL (0x7F) and the
   * C1 controls (0x80-0x9F).
   */
  // eslint-disable-next-line no-control-regex
  private static readonly CONTROL_CHAR_PATTERN = /[\u0000-\u0008\u000B-\u001F\u007F-\u009F]/g

  /** Matches a high surrogate without a following low one, or a low surrogate without a preceding high one. */
  private static readonly LONE_SURROGATE_PATTERN =
    /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]/

  /**
   * Main normalization method that applies all normalization steps.
   *
   * @param content - The content string to normalize
   * @returns Normalized content string (the empty string is returned unchanged)
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const normalized = NormalizationService.normalize('Hello&#x200B;world')
   * // Returns: 'Helloworld'
   * ```
   */
  static normalize(content: string): string {
    // 1. Validation
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.normalize: content must be a string')
    }

    if (content.length === 0) {
      return content
    }

    let normalized = content

    // 2. Remove BOM (first, so it does not interfere with later steps)
    normalized = NormalizationService.removeBOM(normalized)

    // 3. Remove invisible chars before decoding, so that a hidden character
    //    placed inside an entity (e.g. '&\u200Blt;') cannot prevent decoding
    normalized = NormalizationService.removeInvisibleCharacters(normalized)

    // 4. Decode HTML entities
    normalized = NormalizationService.decodeHtmlEntities(normalized)

    // 5. Remove invisible chars again: entities such as '&#x200B;' decode to
    //    exactly the characters step 3 is meant to eliminate
    normalized = NormalizationService.removeInvisibleCharacters(normalized)

    // 6. Unicode normalization, after decoding/stripping so that sequences
    //    completed by those steps (e.g. 'e&#x301;') are composed as well
    normalized = NormalizationService.normalizeUnicode(normalized)

    // 7. Normalize whitespace
    normalized = NormalizationService.normalizeWhitespace(normalized)

    // 8. Normalize line breaks (must precede step 9: '\r' is a control char)
    normalized = NormalizationService.normalizeLineBreaks(normalized)

    // 9. Remove control characters
    normalized = NormalizationService.removeControlCharacters(normalized)

    // 10. Trim
    normalized = NormalizationService.trimWhitespace(normalized)

    return normalized
  }

  /**
   * Normalizes Unicode characters to NFC (Normalization Form Canonical Composition).
   *
   * @param content - Content to normalize
   * @returns Unicode-normalized content
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const normalized = NormalizationService.normalizeUnicode('cafe\u0301')
   * // Returns: 'caf\u00E9'
   * ```
   */
  static normalizeUnicode(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.normalizeUnicode: content must be a string')
    }

    return content.normalize('NFC')
  }

  /**
   * Removes invisible characters (zero-width characters, directional marks,
   * soft hyphen, narrow no-break space) that can be used to disguise content.
   *
   * @param content - Content to clean
   * @returns Content without invisible characters
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const cleaned = NormalizationService.removeInvisibleCharacters('Hello\u200Bworld')
   * // Returns: 'Helloworld'
   * ```
   */
  static removeInvisibleCharacters(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.removeInvisibleCharacters: content must be a string')
    }

    // Spreading iterates by code point, so surrogate pairs stay intact
    return [...content]
      .filter(char => !NormalizationService.INVISIBLE_CHARS.has(char))
      .join('')
  }

  /**
   * Removes a BOM (Byte Order Mark, U+FEFF) from the beginning of content.
   *
   * @param content - Content that may contain BOM
   * @returns Content without a leading BOM
   *
   * @throws {TypeError} If content is not a string
   */
  static removeBOM(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.removeBOM: content must be a string')
    }

    if (content.length > 0 && content.codePointAt(0) === 0xFEFF) {
      return content.slice(1)
    }

    return content
  }

  /**
   * Decodes HTML entities to their actual characters.
   *
   * @remarks
   * When `DOMParser` is available it is tried first; otherwise (or when it
   * yields no change for content containing `&`) a manual decoder handles a
   * fixed set of named entities plus decimal and hexadecimal numeric entities.
   *
   * The manual decoder works in a single pass, so each entity is decoded
   * exactly once: `&amp;quot;` becomes `&quot;`, not `"`. Numeric entities
   * outside 0-0x10FFFF or in the surrogate range (0xD800-0xDFFF) are left
   * untouched, as are unknown named entities.
   *
   * @param content - Content with HTML entities
   * @returns Content with decoded entities
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const decoded = NormalizationService.decodeHtmlEntities('&lt;script&gt;')
   * // Returns: '<script>'
   * ```
   */
  static decodeHtmlEntities(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.decodeHtmlEntities: content must be a string')
    }

    // Use browser's DOMParser if available (for browser environments).
    // NOTE(review): reading textContent of the parsed document also drops any
    // markup in the content, which the manual decoder below does not do, so
    // results can differ between browser and Node.js. Confirm this is intended.
    if (typeof DOMParser !== 'undefined') {
      try {
        const parser = new DOMParser()
        const doc = parser.parseFromString(content, 'text/html')
        const decoded = doc.documentElement.textContent || content
        // Accept the parser result only if it changed something or there was
        // nothing to decode; otherwise fall through to manual decoding
        if (decoded !== content || !content.includes('&')) {
          return decoded
        }
      } catch {
        // Fall through to manual decoding
      }
    }

    // Manual decoding for Node.js or when DOMParser is not available
    return content.replace(
      NormalizationService.ENTITY_PATTERN,
      (match: string, hex: string | undefined, dec: string | undefined, name: string | undefined): string => {
        if (name !== undefined) {
          // Unknown names are kept verbatim
          return NormalizationService.NAMED_ENTITIES.get(name) ?? match
        }

        const codePoint = hex === undefined
          ? Number.parseInt(dec ?? '', 10)
          : Number.parseInt(hex, 16)

        // Only decode Unicode scalar values: within range and not a surrogate,
        // so decoding can never introduce a lone surrogate into the string
        const inRange = codePoint >= 0 && codePoint <= 0x10FFFF
        const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF
        if (!inRange || isSurrogate) {
          return match // Keep original if invalid
        }

        return String.fromCodePoint(codePoint)
      }
    )
  }

  /**
   * Normalizes whitespace by collapsing every run of spaces and/or tabs into
   * a single space. A lone tab is therefore also replaced by a space; line
   * breaks are not touched.
   *
   * @param content - Content with potentially excessive whitespace
   * @returns Content with normalized whitespace
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const normalized = NormalizationService.normalizeWhitespace('Hello    world')
   * // Returns: 'Hello world'
   * ```
   */
  static normalizeWhitespace(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.normalizeWhitespace: content must be a string')
    }

    return content.replace(/[ \t]+/g, ' ')
  }

  /**
   * Normalizes line breaks to Unix-style (\n).
   *
   * @param content - Content with mixed line breaks
   * @returns Content with normalized line breaks
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const normalized = NormalizationService.normalizeLineBreaks('Hello\r\nworld\r')
   * // Returns: 'Hello\nworld\n'
   * ```
   */
  static normalizeLineBreaks(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.normalizeLineBreaks: content must be a string')
    }

    // Windows (\r\n) and old Mac (\r) → \n
    return content.replace(/\r\n?/g, '\n')
  }

  /**
   * Removes control characters except newline (\n) and tab (\t).
   *
   * @remarks
   * Covers 0x00-0x1F (minus 0x09 and 0x0A), DEL (0x7F) and 0x80-0x9F.
   *
   * @param content - Content that may contain control characters
   * @returns Content without control characters
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const cleaned = NormalizationService.removeControlCharacters('Hello\x00world')
   * // Returns: 'Helloworld'
   * ```
   */
  static removeControlCharacters(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.removeControlCharacters: content must be a string')
    }

    return content.replace(NormalizationService.CONTROL_CHAR_PATTERN, '')
  }

  /**
   * Trims leading and trailing whitespace.
   *
   * @param content - Content to trim
   * @returns Trimmed content
   *
   * @throws {TypeError} If content is not a string
   *
   * @example
   * ```typescript
   * const trimmed = NormalizationService.trimWhitespace('  Hello  ')
   * // Returns: 'Hello'
   * ```
   */
  static trimWhitespace(content: string): string {
    if (typeof content !== 'string') {
      throw new TypeError('NormalizationService.trimWhitespace: content must be a string')
    }

    return content.trim()
  }

  /**
   * Checks that content can be represented as valid UTF-8.
   *
   * @remarks
   * A JavaScript string fails to encode to UTF-8 only when it contains an
   * unpaired surrogate, so that is what is checked. A leading U+FEFF is a
   * regular, encodable character and does not make the content invalid.
   *
   * @param content - Content to validate
   * @returns true if content is a string without unpaired surrogates, false otherwise
   *
   * @example
   * ```typescript
   * const isValid = NormalizationService.isValidUTF8('Hello world')
   * // Returns: true
   * ```
   */
  static isValidUTF8(content: string): boolean {
    if (typeof content !== 'string') {
      return false
    }

    return !NormalizationService.LONE_SURROGATE_PATTERN.test(content)
  }
}
@@ -0,0 +1,69 @@
1
+ import type { ClassificationPort } from '../../ports';
2
+ import { ClassificationError } from '../exceptions';
3
+ import { TrustLevel, type Origin, originMap } from '../value-objects';
4
+
5
/**
 * OriginClassificationService classifies content by looking up the trust level
 * associated with its origin.
 *
 * @remarks
 * This service implements the ClassificationPort. Classification is a lookup of
 * `origin.type` in `originMap`; the content itself is never inspected.
 *
 * **Key Characteristics:**
 * - Deterministic for a given `originMap`: same origin type → same trust level
 * - No content analysis: only the origin type matters
 * - Fast: a single map lookup
 *
 * **Future Extensibility:**
 * - Content analysis can be added as a separate layer
 * - Additional analysis can modify the trust level after this initial classification
 *
 * @example
 * ```typescript
 * const classificationService = new OriginClassificationService()
 *
 * // Classify user input (expected: UC, as defined by originMap)
 * const userOrigin = new Origin(OriginType.USER)
 * const trustLevel = classificationService.classify(userOrigin)
 *
 * // Classify system content (expected: TC, as defined by originMap)
 * const systemOrigin = new Origin(OriginType.SYSTEM_GENERATED)
 * const systemTrust = classificationService.classify(systemOrigin)
 * ```
 */
export class OriginClassificationService implements ClassificationPort {
  /**
   * Classifies a content segment based on its origin and returns the trust level.
   *
   * @param origin - The Origin value object representing the source of the content
   *
   * @returns A new TrustLevel built from the value `originMap` holds for `origin.type`
   *
   * @throws {ClassificationError} If the origin type is not mapped in originMap
   *
   * @example
   * ```typescript
   * const service = new OriginClassificationService()
   * const origin = new Origin(OriginType.DOM_VISIBLE)
   * const trustLevel = service.classify(origin)
   * // Expected: TrustLevel(TrustLevelType.STC), as defined by originMap
   * ```
   */
  classify(origin: Origin): TrustLevel {
    const trustLevelType = originMap.get(origin.type);

    // Test for a missing mapping explicitly rather than by truthiness, so a
    // mapped value that happens to be falsy is not reported as unmapped.
    // NOTE(review): assumes originMap.get returns undefined for unknown keys
    // (Map semantics) — confirm against the originMap definition.
    if (trustLevelType === undefined) {
      throw new ClassificationError(
        `Origin type '${origin.type}' is not mapped in originMap. ` +
        `All OriginType values must have a corresponding TrustLevel mapping.`
      );
    }

    return new TrustLevel(trustLevelType);
  }
}
69
+