glin-profanity 3.1.5 → 3.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -566
- package/dist/{types-CdDqSZY7.d.cts → Filter-BGcyIAvO.d.ts} +4 -162
- package/dist/{types-CdDqSZY7.d.ts → Filter-D34Wsmrj.d.cts} +4 -162
- package/dist/frameworks/index.cjs +5257 -0
- package/dist/frameworks/index.d.cts +2 -0
- package/dist/frameworks/index.d.ts +2 -0
- package/dist/frameworks/index.js +5252 -0
- package/dist/frameworks/nextjs.cjs +5257 -0
- package/dist/frameworks/nextjs.d.cts +173 -0
- package/dist/frameworks/nextjs.d.ts +173 -0
- package/dist/frameworks/nextjs.js +5252 -0
- package/dist/index.cjs +151 -85
- package/dist/index.d.cts +5 -29
- package/dist/index.d.ts +5 -29
- package/dist/index.js +152 -85
- package/dist/integrations/index.cjs +6110 -0
- package/dist/integrations/index.d.cts +5 -0
- package/dist/integrations/index.d.ts +5 -0
- package/dist/integrations/index.js +6082 -0
- package/dist/integrations/langchain.cjs +5252 -0
- package/dist/integrations/langchain.d.cts +231 -0
- package/dist/integrations/langchain.d.ts +231 -0
- package/dist/integrations/langchain.js +5239 -0
- package/dist/integrations/openai.cjs +5367 -0
- package/dist/integrations/openai.d.cts +167 -0
- package/dist/integrations/openai.d.ts +167 -0
- package/dist/integrations/openai.js +5362 -0
- package/dist/integrations/semantic.cjs +5314 -0
- package/dist/integrations/semantic.d.cts +268 -0
- package/dist/integrations/semantic.d.ts +268 -0
- package/dist/integrations/semantic.js +5309 -0
- package/dist/integrations/vercel-ai.cjs +5282 -0
- package/dist/integrations/vercel-ai.d.cts +224 -0
- package/dist/integrations/vercel-ai.d.ts +224 -0
- package/dist/integrations/vercel-ai.js +5273 -0
- package/dist/ml/index.cjs +358 -56
- package/dist/ml/index.d.cts +5 -2
- package/dist/ml/index.d.ts +5 -2
- package/dist/ml/index.js +354 -57
- package/dist/ml/transformers.cjs +5237 -0
- package/dist/ml/transformers.d.cts +232 -0
- package/dist/ml/transformers.d.ts +232 -0
- package/dist/ml/transformers.js +5231 -0
- package/dist/multimodal/audio.cjs +5269 -0
- package/dist/multimodal/audio.d.cts +255 -0
- package/dist/multimodal/audio.d.ts +255 -0
- package/dist/multimodal/audio.js +5264 -0
- package/dist/multimodal/index.cjs +5432 -0
- package/dist/multimodal/index.d.cts +4 -0
- package/dist/multimodal/index.d.ts +4 -0
- package/dist/multimodal/index.js +5422 -0
- package/dist/multimodal/ocr.cjs +5193 -0
- package/dist/multimodal/ocr.d.cts +157 -0
- package/dist/multimodal/ocr.d.ts +157 -0
- package/dist/multimodal/ocr.js +5187 -0
- package/dist/react.cjs +5133 -0
- package/dist/react.d.cts +13 -0
- package/dist/react.d.ts +13 -0
- package/dist/react.js +5131 -0
- package/dist/types-B9c_ik4k.d.cts +88 -0
- package/dist/types-B9c_ik4k.d.ts +88 -0
- package/dist/types-BuKh9tvV.d.ts +20 -0
- package/dist/types-Ct_ueYqw.d.cts +76 -0
- package/dist/types-Ct_ueYqw.d.ts +76 -0
- package/dist/types-DI8nzwWc.d.cts +20 -0
- package/package.json +170 -3
package/dist/ml/index.js
CHANGED
|
@@ -810,7 +810,7 @@ var danish_default = {
|
|
|
810
810
|
// ../../shared/dictionaries/dutch.json
|
|
811
811
|
var dutch_default = {
|
|
812
812
|
words: [
|
|
813
|
-
"aardappels
|
|
813
|
+
"aardappels afgieten",
|
|
814
814
|
"achter het raam zitten",
|
|
815
815
|
"afberen",
|
|
816
816
|
"aflebberen",
|
|
@@ -827,7 +827,7 @@ var dutch_default = {
|
|
|
827
827
|
"bagger schijten",
|
|
828
828
|
"balen",
|
|
829
829
|
"bedonderen",
|
|
830
|
-
"
|
|
830
|
+
"befborstel",
|
|
831
831
|
"beffen",
|
|
832
832
|
"bekken",
|
|
833
833
|
"belazeren",
|
|
@@ -836,11 +836,11 @@ var dutch_default = {
|
|
|
836
836
|
"beurt",
|
|
837
837
|
"boemelen",
|
|
838
838
|
"boerelul",
|
|
839
|
-
"
|
|
839
|
+
"boerenpummel",
|
|
840
840
|
"bokkelul",
|
|
841
841
|
"botergeil",
|
|
842
842
|
"broekhoesten",
|
|
843
|
-
"
|
|
843
|
+
"brugpieper",
|
|
844
844
|
"buffelen",
|
|
845
845
|
"buiten de pot piesen",
|
|
846
846
|
"da's kloten van de bok",
|
|
@@ -848,13 +848,13 @@ var dutch_default = {
|
|
|
848
848
|
"de hoer spelen",
|
|
849
849
|
"de hond uitlaten",
|
|
850
850
|
"de koffer induiken",
|
|
851
|
-
"
|
|
851
|
+
"del",
|
|
852
852
|
"de pijp aan maarten geven",
|
|
853
853
|
"de pijp uitgaan",
|
|
854
854
|
"dombo",
|
|
855
|
-
"
|
|
855
|
+
"draaikont",
|
|
856
856
|
"driehoog achter wonen",
|
|
857
|
-
"
|
|
857
|
+
"drol",
|
|
858
858
|
"drooggeiler",
|
|
859
859
|
"droogkloot",
|
|
860
860
|
"een beurt geven",
|
|
@@ -874,7 +874,7 @@ var dutch_default = {
|
|
|
874
874
|
"godverdomme",
|
|
875
875
|
"graftak",
|
|
876
876
|
"gras maaien",
|
|
877
|
-
"
|
|
877
|
+
"gratenkut",
|
|
878
878
|
"greppeldel",
|
|
879
879
|
"griet",
|
|
880
880
|
"hoempert",
|
|
@@ -887,7 +887,7 @@ var dutch_default = {
|
|
|
887
887
|
"huisdealer",
|
|
888
888
|
"johny",
|
|
889
889
|
"kanen",
|
|
890
|
-
"
|
|
890
|
+
"kettingzeug",
|
|
891
891
|
"klaarkomen",
|
|
892
892
|
"klerebeer",
|
|
893
893
|
"klojo",
|
|
@@ -897,22 +897,22 @@ var dutch_default = {
|
|
|
897
897
|
"klootzak",
|
|
898
898
|
"kloten",
|
|
899
899
|
"knor",
|
|
900
|
-
"
|
|
900
|
+
"kont",
|
|
901
901
|
"kontneuken",
|
|
902
902
|
"krentekakker",
|
|
903
903
|
"kut",
|
|
904
904
|
"kuttelikkertje",
|
|
905
|
-
"
|
|
905
|
+
"kwakkie",
|
|
906
906
|
"liefdesgrot",
|
|
907
907
|
"lul",
|
|
908
908
|
"lul-de-behanger",
|
|
909
909
|
"lulhannes",
|
|
910
910
|
"lummel",
|
|
911
911
|
"mafketel",
|
|
912
|
-
"
|
|
912
|
+
"matennaaier",
|
|
913
913
|
"matje",
|
|
914
914
|
"mof",
|
|
915
|
-
"
|
|
915
|
+
"muts",
|
|
916
916
|
"naaien",
|
|
917
917
|
"naakt",
|
|
918
918
|
"neuken",
|
|
@@ -932,9 +932,9 @@ var dutch_default = {
|
|
|
932
932
|
"paal",
|
|
933
933
|
"paardelul",
|
|
934
934
|
"palen",
|
|
935
|
-
"
|
|
935
|
+
"penoze",
|
|
936
936
|
"piesen",
|
|
937
|
-
"
|
|
937
|
+
"pijpbekkie",
|
|
938
938
|
"pijpen",
|
|
939
939
|
"pik",
|
|
940
940
|
"pleurislaaier",
|
|
@@ -949,7 +949,7 @@ var dutch_default = {
|
|
|
949
949
|
"reet",
|
|
950
950
|
"reetridder",
|
|
951
951
|
"reet trappen, voor zijn",
|
|
952
|
-
"
|
|
952
|
+
"remsporen",
|
|
953
953
|
"reutelen",
|
|
954
954
|
"rothoer",
|
|
955
955
|
"rotzak",
|
|
@@ -962,25 +962,25 @@ var dutch_default = {
|
|
|
962
962
|
"schuinsmarcheerder",
|
|
963
963
|
"shit",
|
|
964
964
|
"slempen",
|
|
965
|
-
"
|
|
965
|
+
"slet",
|
|
966
966
|
"sletterig",
|
|
967
967
|
"slik mijn zaad",
|
|
968
|
-
"
|
|
968
|
+
"snol",
|
|
969
969
|
"spuiten",
|
|
970
970
|
"standje",
|
|
971
|
-
"standje-
|
|
971
|
+
"standje-69",
|
|
972
972
|
"stoephoer",
|
|
973
973
|
"stootje",
|
|
974
|
-
"
|
|
975
|
-
"
|
|
974
|
+
"stront",
|
|
975
|
+
"sufferd",
|
|
976
976
|
"tapijtnek",
|
|
977
|
-
"
|
|
977
|
+
"teef",
|
|
978
978
|
"temeier",
|
|
979
979
|
"teringlijer",
|
|
980
980
|
"toeter",
|
|
981
|
-
"
|
|
982
|
-
"
|
|
983
|
-
"trottoir
|
|
981
|
+
"tongzoen",
|
|
982
|
+
"triootje",
|
|
983
|
+
"trottoir prostituee",
|
|
984
984
|
"trottoirteef",
|
|
985
985
|
"vergallen",
|
|
986
986
|
"verkloten",
|
|
@@ -1053,6 +1053,8 @@ var english_default = {
|
|
|
1053
1053
|
"2 girls 1 cup",
|
|
1054
1054
|
"2g1c",
|
|
1055
1055
|
"a$$",
|
|
1056
|
+
"@ss",
|
|
1057
|
+
"4ss",
|
|
1056
1058
|
"acrotomophilia",
|
|
1057
1059
|
"alabama hot pocket",
|
|
1058
1060
|
"alaskan pipeline",
|
|
@@ -1192,6 +1194,10 @@ var english_default = {
|
|
|
1192
1194
|
"eunuch",
|
|
1193
1195
|
"f*ck",
|
|
1194
1196
|
"f@ck",
|
|
1197
|
+
"f4ck",
|
|
1198
|
+
"fvck",
|
|
1199
|
+
"phuck",
|
|
1200
|
+
"fuk",
|
|
1195
1201
|
"faggot",
|
|
1196
1202
|
"fecal",
|
|
1197
1203
|
"felch",
|
|
@@ -1373,6 +1379,9 @@ var english_default = {
|
|
|
1373
1379
|
"shemale",
|
|
1374
1380
|
"shibari",
|
|
1375
1381
|
"shit",
|
|
1382
|
+
"sh1t",
|
|
1383
|
+
"$hit",
|
|
1384
|
+
"$h!t",
|
|
1376
1385
|
"shitblimp",
|
|
1377
1386
|
"shithead",
|
|
1378
1387
|
"shitshow",
|
|
@@ -2476,7 +2485,7 @@ var italian_default = {
|
|
|
2476
2485
|
"di merda",
|
|
2477
2486
|
"ditalino",
|
|
2478
2487
|
"duro",
|
|
2479
|
-
"fare una
|
|
2488
|
+
"fare una sega",
|
|
2480
2489
|
"fava",
|
|
2481
2490
|
"femminuccia",
|
|
2482
2491
|
"fica",
|
|
@@ -2724,7 +2733,6 @@ var japanese_default = {
|
|
|
2724
2733
|
"\u7389\u8210\u3081",
|
|
2725
2734
|
"\u7DCA\u7E1B",
|
|
2726
2735
|
"\u8FD1\u89AA\u76F8\u59E6",
|
|
2727
|
-
"\u5ACC\u3044",
|
|
2728
2736
|
"\u5F8C\u80CC\u4F4D",
|
|
2729
2737
|
"\u5408\u610F\u306E\u6027\u4EA4",
|
|
2730
2738
|
"\u62F7\u554F",
|
|
@@ -2737,7 +2745,6 @@ var japanese_default = {
|
|
|
2737
2745
|
"\u5C04\u7CBE",
|
|
2738
2746
|
"\u624B\u30B3\u30AD",
|
|
2739
2747
|
"\u7363\u59E6",
|
|
2740
|
-
"\u5973\u306E\u5B50",
|
|
2741
2748
|
"\u5973\u738B\u69D8",
|
|
2742
2749
|
"\u5973\u5B50\u9AD8\u751F",
|
|
2743
2750
|
"\u5973\u88C5",
|
|
@@ -2814,7 +2821,6 @@ var turkish_default = {
|
|
|
2814
2821
|
"ak",
|
|
2815
2822
|
"akp",
|
|
2816
2823
|
"al a\u011Fz\u0131na",
|
|
2817
|
-
"allah",
|
|
2818
2824
|
"allahs\u0131z",
|
|
2819
2825
|
"am",
|
|
2820
2826
|
"am biti",
|
|
@@ -2909,7 +2915,6 @@ var turkish_default = {
|
|
|
2909
2915
|
"am\u0131n\u0131 s",
|
|
2910
2916
|
"am\u0131s\u0131na",
|
|
2911
2917
|
"am\u0131s\u0131n\u0131",
|
|
2912
|
-
"ana",
|
|
2913
2918
|
"anaaann",
|
|
2914
2919
|
"anal",
|
|
2915
2920
|
"analarn",
|
|
@@ -3041,8 +3046,6 @@ var turkish_default = {
|
|
|
3041
3046
|
"cikar",
|
|
3042
3047
|
"cim",
|
|
3043
3048
|
"cm",
|
|
3044
|
-
"coca cola",
|
|
3045
|
-
"cola",
|
|
3046
3049
|
"dalaks\u0131z",
|
|
3047
3050
|
"dallama",
|
|
3048
3051
|
"daltassak",
|
|
@@ -3840,7 +3843,7 @@ var turkish_default = {
|
|
|
3840
3843
|
// ../../shared/dictionaries/spanish.json
|
|
3841
3844
|
var spanish_default = {
|
|
3842
3845
|
words: [
|
|
3843
|
-
"
|
|
3846
|
+
"asesinato",
|
|
3844
3847
|
"asno",
|
|
3845
3848
|
"bastardo",
|
|
3846
3849
|
"Bollera",
|
|
@@ -4140,6 +4143,34 @@ var GAMING_POSITIVE = /* @__PURE__ */ new Set([
|
|
|
4140
4143
|
"move",
|
|
4141
4144
|
"combo"
|
|
4142
4145
|
]);
|
|
4146
|
+
var GAMING_ACCEPTABLE_WORDS = /* @__PURE__ */ new Set([
|
|
4147
|
+
"kill",
|
|
4148
|
+
"killer",
|
|
4149
|
+
"killed",
|
|
4150
|
+
"killing",
|
|
4151
|
+
"shoot",
|
|
4152
|
+
"shot",
|
|
4153
|
+
"shooting",
|
|
4154
|
+
"die",
|
|
4155
|
+
"dying",
|
|
4156
|
+
"died",
|
|
4157
|
+
"dead",
|
|
4158
|
+
"death",
|
|
4159
|
+
"badass",
|
|
4160
|
+
"sick",
|
|
4161
|
+
"insane",
|
|
4162
|
+
"crazy",
|
|
4163
|
+
"mad",
|
|
4164
|
+
"beast",
|
|
4165
|
+
"savage",
|
|
4166
|
+
"suck",
|
|
4167
|
+
"sucks",
|
|
4168
|
+
"wtf",
|
|
4169
|
+
"omg",
|
|
4170
|
+
"hell",
|
|
4171
|
+
"damn",
|
|
4172
|
+
"crap"
|
|
4173
|
+
]);
|
|
4143
4174
|
var POSITIVE_PHRASES = /* @__PURE__ */ new Map([
|
|
4144
4175
|
["the bomb", 0.9],
|
|
4145
4176
|
// "this movie is the bomb"
|
|
@@ -4172,7 +4203,9 @@ var ContextAnalyzer = class {
|
|
|
4172
4203
|
constructor(config) {
|
|
4173
4204
|
this.contextWindow = config.contextWindow;
|
|
4174
4205
|
this.language = config.language;
|
|
4175
|
-
this.domainWhitelists = new Set(
|
|
4206
|
+
this.domainWhitelists = new Set(
|
|
4207
|
+
(config.domainWhitelists || []).map((word) => word.toLowerCase())
|
|
4208
|
+
);
|
|
4176
4209
|
}
|
|
4177
4210
|
/**
|
|
4178
4211
|
* Analyzes the context around a profanity match to determine if it should be flagged
|
|
@@ -4209,10 +4242,9 @@ var ContextAnalyzer = class {
|
|
|
4209
4242
|
isWhitelisted: false
|
|
4210
4243
|
};
|
|
4211
4244
|
}
|
|
4212
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
4213
4245
|
checkPhraseContext(contextText, matchWord) {
|
|
4214
4246
|
for (const [phrase, score] of POSITIVE_PHRASES.entries()) {
|
|
4215
|
-
if (contextText.includes(phrase)) {
|
|
4247
|
+
if (phrase.includes(matchWord) && contextText.includes(phrase)) {
|
|
4216
4248
|
return {
|
|
4217
4249
|
contextScore: score,
|
|
4218
4250
|
reason: `Positive phrase detected: "${phrase}"`,
|
|
@@ -4231,21 +4263,29 @@ var ContextAnalyzer = class {
|
|
|
4231
4263
|
}
|
|
4232
4264
|
return null;
|
|
4233
4265
|
}
|
|
4234
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
4235
4266
|
isDomainWhitelisted(contextWords, matchWord) {
|
|
4267
|
+
const normalizedMatchWord = matchWord.toLowerCase();
|
|
4236
4268
|
for (const word of contextWords) {
|
|
4237
|
-
if (this.domainWhitelists.has(word)
|
|
4269
|
+
if (this.domainWhitelists.has(word)) {
|
|
4238
4270
|
return true;
|
|
4239
4271
|
}
|
|
4272
|
+
if (GAMING_POSITIVE.has(word)) {
|
|
4273
|
+
if (GAMING_ACCEPTABLE_WORDS.has(normalizedMatchWord)) {
|
|
4274
|
+
return true;
|
|
4275
|
+
}
|
|
4276
|
+
}
|
|
4240
4277
|
}
|
|
4241
4278
|
return false;
|
|
4242
4279
|
}
|
|
4243
|
-
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
4244
4280
|
generateReason(score, contextWords) {
|
|
4281
|
+
const foundPositive = Array.from(new Set(contextWords.filter((word) => POSITIVE_INDICATORS.has(word))));
|
|
4282
|
+
const foundNegative = Array.from(new Set(contextWords.filter((word) => NEGATIVE_INDICATORS.has(word))));
|
|
4245
4283
|
if (score >= 0.7) {
|
|
4246
|
-
|
|
4284
|
+
const details = foundPositive.length > 0 ? ` (found: ${foundPositive.join(", ")})` : "";
|
|
4285
|
+
return `Positive context detected${details} - likely not profanity`;
|
|
4247
4286
|
} else if (score <= 0.3) {
|
|
4248
|
-
|
|
4287
|
+
const details = foundNegative.length > 0 ? ` (found: ${foundNegative.join(", ")})` : "";
|
|
4288
|
+
return `Negative context detected${details} - likely profanity`;
|
|
4249
4289
|
} else {
|
|
4250
4290
|
return "Neutral context - uncertain classification";
|
|
4251
4291
|
}
|
|
@@ -4303,7 +4343,7 @@ var ContextAnalyzer = class {
|
|
|
4303
4343
|
* Updates the domain whitelist for this analyzer instance
|
|
4304
4344
|
*/
|
|
4305
4345
|
updateDomainWhitelist(newWhitelist) {
|
|
4306
|
-
this.domainWhitelists = new Set(newWhitelist);
|
|
4346
|
+
this.domainWhitelists = new Set(newWhitelist.map((word) => word.toLowerCase()));
|
|
4307
4347
|
}
|
|
4308
4348
|
/**
|
|
4309
4349
|
* Adds words to the domain whitelist
|
|
@@ -4462,6 +4502,10 @@ var HOMOGLYPHS = {
|
|
|
4462
4502
|
// Cyrillic small e
|
|
4463
4503
|
"\u0415": "E",
|
|
4464
4504
|
// Cyrillic capital E
|
|
4505
|
+
"\u043A": "k",
|
|
4506
|
+
// Cyrillic small ka
|
|
4507
|
+
"\u041A": "K",
|
|
4508
|
+
// Cyrillic capital Ka
|
|
4465
4509
|
"\u043E": "o",
|
|
4466
4510
|
// Cyrillic small o
|
|
4467
4511
|
"\u041E": "O",
|
|
@@ -4474,9 +4518,9 @@ var HOMOGLYPHS = {
|
|
|
4474
4518
|
// Cyrillic small es
|
|
4475
4519
|
"\u0421": "C",
|
|
4476
4520
|
// Cyrillic capital Es
|
|
4477
|
-
"\u0443": "
|
|
4478
|
-
// Cyrillic small u
|
|
4479
|
-
"\u0423": "
|
|
4521
|
+
"\u0443": "u",
|
|
4522
|
+
// Cyrillic small u (map to u, not y)
|
|
4523
|
+
"\u0423": "U",
|
|
4480
4524
|
// Cyrillic capital U
|
|
4481
4525
|
"\u0445": "x",
|
|
4482
4526
|
// Cyrillic small ha
|
|
@@ -4494,6 +4538,11 @@ var HOMOGLYPHS = {
|
|
|
4494
4538
|
// Cyrillic small dze
|
|
4495
4539
|
"\u0405": "S",
|
|
4496
4540
|
// Cyrillic capital Dze
|
|
4541
|
+
// Currency and special symbols that look like letters
|
|
4542
|
+
"\xA2": "c",
|
|
4543
|
+
// Cent sign
|
|
4544
|
+
"\u0192": "f",
|
|
4545
|
+
// Latin small f with hook (florin)
|
|
4497
4546
|
// Greek homoglyphs
|
|
4498
4547
|
"\u03B1": "a",
|
|
4499
4548
|
// Greek small alpha
|
|
@@ -4840,6 +4889,7 @@ var Filter = class {
|
|
|
4840
4889
|
this.cacheResults = config?.cacheResults ?? false;
|
|
4841
4890
|
this.maxCacheSize = config?.maxCacheSize ?? 1e3;
|
|
4842
4891
|
this.cache = /* @__PURE__ */ new Map();
|
|
4892
|
+
this.regexCache = /* @__PURE__ */ new Map();
|
|
4843
4893
|
let words = [];
|
|
4844
4894
|
if (config?.allLanguages) {
|
|
4845
4895
|
for (const lang in dictionary_default) {
|
|
@@ -4869,9 +4919,10 @@ var Filter = class {
|
|
|
4869
4919
|
* Applies Unicode normalization, leetspeak detection, and obfuscation handling.
|
|
4870
4920
|
*
|
|
4871
4921
|
* @param text - The input text to normalize
|
|
4922
|
+
* @param aggressive - If true, collapses to single chars (for repeated char detection)
|
|
4872
4923
|
* @returns The normalized text
|
|
4873
4924
|
*/
|
|
4874
|
-
normalizeText(text) {
|
|
4925
|
+
normalizeText(text, aggressive = false) {
|
|
4875
4926
|
let normalized = text;
|
|
4876
4927
|
if (this.normalizeUnicodeEnabled) {
|
|
4877
4928
|
normalized = normalizeUnicode(normalized);
|
|
@@ -4880,6 +4931,8 @@ var Filter = class {
|
|
|
4880
4931
|
normalized = normalizeLeetspeak(normalized, {
|
|
4881
4932
|
level: this.leetspeakLevel,
|
|
4882
4933
|
collapseRepeated: true,
|
|
4934
|
+
// Keep double letters like "ss" for normal check, collapse all for aggressive
|
|
4935
|
+
maxRepeated: aggressive ? 1 : 2,
|
|
4883
4936
|
removeSpacedChars: true
|
|
4884
4937
|
});
|
|
4885
4938
|
}
|
|
@@ -4910,6 +4963,7 @@ var Filter = class {
|
|
|
4910
4963
|
*/
|
|
4911
4964
|
clearCache() {
|
|
4912
4965
|
this.cache.clear();
|
|
4966
|
+
this.regexCache.clear();
|
|
4913
4967
|
}
|
|
4914
4968
|
/**
|
|
4915
4969
|
* Gets the current cache size.
|
|
@@ -4991,10 +5045,17 @@ var Filter = class {
|
|
|
4991
5045
|
return this.cache.get(key);
|
|
4992
5046
|
}
|
|
4993
5047
|
getRegex(word) {
|
|
5048
|
+
if (this.regexCache.has(word)) {
|
|
5049
|
+
const regex2 = this.regexCache.get(word);
|
|
5050
|
+
regex2.lastIndex = 0;
|
|
5051
|
+
return regex2;
|
|
5052
|
+
}
|
|
4994
5053
|
const flags = this.caseSensitive ? "g" : "gi";
|
|
4995
5054
|
const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
4996
5055
|
const boundary = this.wordBoundaries ? "\\b" : "";
|
|
4997
|
-
|
|
5056
|
+
const regex = new RegExp(`${boundary}${escapedWord}${boundary}`, flags);
|
|
5057
|
+
this.regexCache.set(word, regex);
|
|
5058
|
+
return regex;
|
|
4998
5059
|
}
|
|
4999
5060
|
isFuzzyToleranceMatch(word, text) {
|
|
5000
5061
|
const simplifiedText = text.toLowerCase().replace(/[^a-z]/g, "");
|
|
@@ -5012,11 +5073,12 @@ var Filter = class {
|
|
|
5012
5073
|
return score >= this.fuzzyToleranceLevel;
|
|
5013
5074
|
}
|
|
5014
5075
|
evaluateSeverity(word, text) {
|
|
5015
|
-
if (this.
|
|
5076
|
+
if (this.getRegex(word).test(text)) {
|
|
5016
5077
|
return 1 /* EXACT */;
|
|
5017
5078
|
}
|
|
5018
|
-
if (this.
|
|
5019
|
-
|
|
5079
|
+
if (!this.wordBoundaries && this.isFuzzyToleranceMatch(word, text)) {
|
|
5080
|
+
return 2 /* FUZZY */;
|
|
5081
|
+
}
|
|
5020
5082
|
return void 0;
|
|
5021
5083
|
}
|
|
5022
5084
|
/**
|
|
@@ -5036,9 +5098,20 @@ var Filter = class {
|
|
|
5036
5098
|
* ```
|
|
5037
5099
|
*/
|
|
5038
5100
|
isProfane(value) {
|
|
5039
|
-
const
|
|
5101
|
+
const originalInput = value;
|
|
5102
|
+
const normalizedInput = this.normalizeText(value);
|
|
5103
|
+
const aggressiveInput = this.normalizeText(value, true);
|
|
5040
5104
|
for (const word of this.words.keys()) {
|
|
5041
|
-
if (
|
|
5105
|
+
if (this.ignoreWords.has(word.toLowerCase())) {
|
|
5106
|
+
continue;
|
|
5107
|
+
}
|
|
5108
|
+
if (this.evaluateSeverity(word, originalInput) !== void 0) {
|
|
5109
|
+
return true;
|
|
5110
|
+
}
|
|
5111
|
+
if (this.evaluateSeverity(word, normalizedInput) !== void 0) {
|
|
5112
|
+
return true;
|
|
5113
|
+
}
|
|
5114
|
+
if (this.evaluateSeverity(word, aggressiveInput) !== void 0) {
|
|
5042
5115
|
return true;
|
|
5043
5116
|
}
|
|
5044
5117
|
}
|
|
@@ -5079,23 +5152,45 @@ var Filter = class {
|
|
|
5079
5152
|
return cachedResult;
|
|
5080
5153
|
}
|
|
5081
5154
|
if (!this.enableContextAware) {
|
|
5082
|
-
|
|
5083
|
-
|
|
5155
|
+
const originalInput = text.toLowerCase();
|
|
5156
|
+
const normalizedInput = this.normalizeText(text).toLowerCase();
|
|
5157
|
+
const aggressiveInput = this.normalizeText(text, true).toLowerCase();
|
|
5084
5158
|
const profaneWords2 = [];
|
|
5085
5159
|
const severityMap2 = {};
|
|
5086
5160
|
for (const dictWord of this.words.keys()) {
|
|
5087
5161
|
if (this.ignoreWords.has(dictWord.toLowerCase())) continue;
|
|
5088
|
-
|
|
5162
|
+
let severity = this.evaluateSeverity(dictWord, originalInput);
|
|
5089
5163
|
if (severity !== void 0) {
|
|
5090
5164
|
const regex = this.getRegex(dictWord);
|
|
5091
5165
|
let match;
|
|
5092
|
-
while ((match = regex.exec(
|
|
5166
|
+
while ((match = regex.exec(originalInput)) !== null) {
|
|
5093
5167
|
profaneWords2.push(match[0]);
|
|
5094
5168
|
if (severityMap2[match[0]] === void 0) {
|
|
5095
5169
|
severityMap2[match[0]] = severity;
|
|
5096
5170
|
}
|
|
5097
5171
|
}
|
|
5098
5172
|
}
|
|
5173
|
+
severity = this.evaluateSeverity(dictWord, normalizedInput);
|
|
5174
|
+
if (severity !== void 0) {
|
|
5175
|
+
const regex = this.getRegex(dictWord);
|
|
5176
|
+
while ((regex.exec(normalizedInput)) !== null) {
|
|
5177
|
+
if (!profaneWords2.includes(dictWord)) {
|
|
5178
|
+
profaneWords2.push(dictWord);
|
|
5179
|
+
if (severityMap2[dictWord] === void 0) {
|
|
5180
|
+
severityMap2[dictWord] = severity;
|
|
5181
|
+
}
|
|
5182
|
+
}
|
|
5183
|
+
}
|
|
5184
|
+
}
|
|
5185
|
+
severity = this.evaluateSeverity(dictWord, aggressiveInput);
|
|
5186
|
+
if (severity !== void 0) {
|
|
5187
|
+
if (!profaneWords2.includes(dictWord)) {
|
|
5188
|
+
profaneWords2.push(dictWord);
|
|
5189
|
+
if (severityMap2[dictWord] === void 0) {
|
|
5190
|
+
severityMap2[dictWord] = severity;
|
|
5191
|
+
}
|
|
5192
|
+
}
|
|
5193
|
+
}
|
|
5099
5194
|
}
|
|
5100
5195
|
let processedText2 = text;
|
|
5101
5196
|
if (this.replaceWith && profaneWords2.length > 0) {
|
|
@@ -5481,4 +5576,206 @@ var HybridFilter = class {
|
|
|
5481
5576
|
}
|
|
5482
5577
|
};
|
|
5483
5578
|
|
|
5484
|
-
|
|
5579
|
+
// src/ml/transformers.ts
|
|
5580
|
+
var RECOMMENDED_MODELS = {
|
|
5581
|
+
/** High accuracy English model (97.5%) - 67M params */
|
|
5582
|
+
pardonmyai: "tarekziade/pardonmyai",
|
|
5583
|
+
/** Smaller version for constrained environments */
|
|
5584
|
+
pardonmyaiTiny: "tarekziade/pardonmyai-tiny",
|
|
5585
|
+
/** Multilingual toxicity detection (7 languages) */
|
|
5586
|
+
toxicBert: "unitary/toxic-bert",
|
|
5587
|
+
/** Offensive speech detector (DeBERTa-based) */
|
|
5588
|
+
offensiveSpeech: "KoalaAI/OffensiveSpeechDetector"
|
|
5589
|
+
};
|
|
5590
|
+
var MODEL_PROFANE_LABELS = {
|
|
5591
|
+
"tarekziade/pardonmyai": "profane",
|
|
5592
|
+
"tarekziade/pardonmyai-tiny": "profane",
|
|
5593
|
+
"unitary/toxic-bert": "toxic",
|
|
5594
|
+
"KoalaAI/OffensiveSpeechDetector": "LABEL_1",
|
|
5595
|
+
// Offensive
|
|
5596
|
+
default: "LABEL_1"
|
|
5597
|
+
};
|
|
5598
|
+
async function getTransformers() {
|
|
5599
|
+
try {
|
|
5600
|
+
const transformers = await import('@xenova/transformers');
|
|
5601
|
+
return transformers;
|
|
5602
|
+
} catch {
|
|
5603
|
+
throw new Error(
|
|
5604
|
+
"Transformers.js is required for ML features. Install it with: npm install @xenova/transformers"
|
|
5605
|
+
);
|
|
5606
|
+
}
|
|
5607
|
+
}
|
|
5608
|
+
async function createMLChecker(config = {}) {
|
|
5609
|
+
const {
|
|
5610
|
+
model = RECOMMENDED_MODELS.pardonmyai,
|
|
5611
|
+
threshold = 0.5,
|
|
5612
|
+
profaneLabel = MODEL_PROFANE_LABELS[model] || MODEL_PROFANE_LABELS.default,
|
|
5613
|
+
quantized = true,
|
|
5614
|
+
device = "cpu"
|
|
5615
|
+
} = config;
|
|
5616
|
+
const transformers = await getTransformers();
|
|
5617
|
+
const classifier = await transformers.pipeline("text-classification", model, {
|
|
5618
|
+
quantized,
|
|
5619
|
+
device
|
|
5620
|
+
});
|
|
5621
|
+
return {
|
|
5622
|
+
/**
|
|
5623
|
+
* Check a single text for profanity
|
|
5624
|
+
*/
|
|
5625
|
+
async check(text) {
|
|
5626
|
+
const startTime = Date.now();
|
|
5627
|
+
const output = await classifier(text);
|
|
5628
|
+
const processingTimeMs = Date.now() - startTime;
|
|
5629
|
+
const profaneScore = output.find((o) => o.label === profaneLabel)?.score || 0;
|
|
5630
|
+
const containsProfanity = profaneScore >= threshold;
|
|
5631
|
+
return {
|
|
5632
|
+
containsProfanity,
|
|
5633
|
+
confidence: profaneScore,
|
|
5634
|
+
rawOutput: output,
|
|
5635
|
+
processingTimeMs
|
|
5636
|
+
};
|
|
5637
|
+
},
|
|
5638
|
+
/**
|
|
5639
|
+
* Check multiple texts
|
|
5640
|
+
*/
|
|
5641
|
+
async checkBatch(texts) {
|
|
5642
|
+
return Promise.all(texts.map((text) => this.check(text)));
|
|
5643
|
+
},
|
|
5644
|
+
/**
|
|
5645
|
+
* Get the profanity score for text (0-1)
|
|
5646
|
+
*/
|
|
5647
|
+
async getScore(text) {
|
|
5648
|
+
const result = await this.check(text);
|
|
5649
|
+
return result.confidence;
|
|
5650
|
+
},
|
|
5651
|
+
/**
|
|
5652
|
+
* Get current configuration
|
|
5653
|
+
*/
|
|
5654
|
+
getConfig() {
|
|
5655
|
+
return { model, threshold, profaneLabel, quantized, device };
|
|
5656
|
+
},
|
|
5657
|
+
/**
|
|
5658
|
+
* Dispose of the model (free memory)
|
|
5659
|
+
*/
|
|
5660
|
+
dispose() {
|
|
5661
|
+
}
|
|
5662
|
+
};
|
|
5663
|
+
}
|
|
5664
|
+
async function createHybridChecker(config = {}) {
|
|
5665
|
+
const {
|
|
5666
|
+
model = RECOMMENDED_MODELS.pardonmyai,
|
|
5667
|
+
threshold = 0.5,
|
|
5668
|
+
profaneLabel,
|
|
5669
|
+
quantized = true,
|
|
5670
|
+
device = "cpu",
|
|
5671
|
+
filterConfig = {},
|
|
5672
|
+
mlThreshold = 0.3,
|
|
5673
|
+
dictionaryWeight = 0.6,
|
|
5674
|
+
mlWeight = 0.4
|
|
5675
|
+
} = config;
|
|
5676
|
+
const filter = new Filter({
|
|
5677
|
+
languages: filterConfig.languages || ["english"],
|
|
5678
|
+
detectLeetspeak: filterConfig.detectLeetspeak ?? true,
|
|
5679
|
+
normalizeUnicode: filterConfig.normalizeUnicode ?? true,
|
|
5680
|
+
severityLevels: true,
|
|
5681
|
+
cacheResults: true,
|
|
5682
|
+
...filterConfig
|
|
5683
|
+
});
|
|
5684
|
+
let mlChecker = null;
|
|
5685
|
+
async function getMLChecker() {
|
|
5686
|
+
if (!mlChecker) {
|
|
5687
|
+
mlChecker = await createMLChecker({
|
|
5688
|
+
model,
|
|
5689
|
+
threshold,
|
|
5690
|
+
profaneLabel,
|
|
5691
|
+
quantized,
|
|
5692
|
+
device
|
|
5693
|
+
});
|
|
5694
|
+
}
|
|
5695
|
+
return mlChecker;
|
|
5696
|
+
}
|
|
5697
|
+
return {
|
|
5698
|
+
/**
|
|
5699
|
+
* Check text using hybrid approach
|
|
5700
|
+
*/
|
|
5701
|
+
async check(text) {
|
|
5702
|
+
const startTime = Date.now();
|
|
5703
|
+
const dictionaryResult = filter.checkProfanity(text);
|
|
5704
|
+
if (dictionaryResult.containsProfanity) {
|
|
5705
|
+
return {
|
|
5706
|
+
containsProfanity: true,
|
|
5707
|
+
confidence: 1,
|
|
5708
|
+
dictionaryResult,
|
|
5709
|
+
usedML: false,
|
|
5710
|
+
profaneWords: dictionaryResult.profaneWords,
|
|
5711
|
+
processingTimeMs: Date.now() - startTime
|
|
5712
|
+
};
|
|
5713
|
+
}
|
|
5714
|
+
const ml = await getMLChecker();
|
|
5715
|
+
const mlResult = await ml.check(text);
|
|
5716
|
+
const dictionaryScore = dictionaryResult.containsProfanity ? 1 : 0;
|
|
5717
|
+
const combinedScore = dictionaryScore * dictionaryWeight + mlResult.confidence * mlWeight;
|
|
5718
|
+
const containsProfanity = combinedScore >= mlThreshold;
|
|
5719
|
+
return {
|
|
5720
|
+
containsProfanity,
|
|
5721
|
+
confidence: combinedScore,
|
|
5722
|
+
dictionaryResult,
|
|
5723
|
+
mlResult,
|
|
5724
|
+
usedML: true,
|
|
5725
|
+
profaneWords: dictionaryResult.profaneWords,
|
|
5726
|
+
processingTimeMs: Date.now() - startTime
|
|
5727
|
+
};
|
|
5728
|
+
},
|
|
5729
|
+
/**
|
|
5730
|
+
* Check multiple texts
|
|
5731
|
+
*/
|
|
5732
|
+
async checkBatch(texts) {
|
|
5733
|
+
return Promise.all(texts.map((text) => this.check(text)));
|
|
5734
|
+
},
|
|
5735
|
+
/**
|
|
5736
|
+
* Dictionary-only check (fast, no ML)
|
|
5737
|
+
*/
|
|
5738
|
+
checkFast(text) {
|
|
5739
|
+
return filter.checkProfanity(text);
|
|
5740
|
+
},
|
|
5741
|
+
/**
|
|
5742
|
+
* ML-only check (slower, more accurate)
|
|
5743
|
+
*/
|
|
5744
|
+
async checkML(text) {
|
|
5745
|
+
const ml = await getMLChecker();
|
|
5746
|
+
return ml.check(text);
|
|
5747
|
+
},
|
|
5748
|
+
/**
|
|
5749
|
+
* Get the underlying filter
|
|
5750
|
+
*/
|
|
5751
|
+
getFilter() {
|
|
5752
|
+
return filter;
|
|
5753
|
+
},
|
|
5754
|
+
/**
|
|
5755
|
+
* Dispose of resources
|
|
5756
|
+
*/
|
|
5757
|
+
async dispose() {
|
|
5758
|
+
if (mlChecker) {
|
|
5759
|
+
mlChecker.dispose();
|
|
5760
|
+
mlChecker = null;
|
|
5761
|
+
}
|
|
5762
|
+
}
|
|
5763
|
+
};
|
|
5764
|
+
}
|
|
5765
|
+
async function isTransformersAvailable() {
|
|
5766
|
+
try {
|
|
5767
|
+
await getTransformers();
|
|
5768
|
+
return true;
|
|
5769
|
+
} catch {
|
|
5770
|
+
return false;
|
|
5771
|
+
}
|
|
5772
|
+
}
|
|
5773
|
+
async function preloadModel(model = RECOMMENDED_MODELS.pardonmyai, options = {}) {
|
|
5774
|
+
const { quantized = true } = options;
|
|
5775
|
+
const transformers = await getTransformers();
|
|
5776
|
+
await transformers.pipeline("text-classification", model, {
|
|
5777
|
+
quantized
|
|
5778
|
+
});
|
|
5779
|
+
}
|
|
5780
|
+
|
|
5781
|
+
export { HybridFilter, RECOMMENDED_MODELS, ToxicityDetector, createHybridChecker, createMLChecker, isTransformersAvailable, preloadModel };
|