korcen 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
korcen/korcen.py
CHANGED
@@ -137,7 +137,7 @@ FALSE_POSITIVE_PATTERNS_GENERAL = [
|
|
137
137
|
'줫습니다', '줫음', '줫잖아', '줫겠지', '쫒아', '쫒는', '쫒기다', '쫒기라', '쫒기로',
|
138
138
|
'쫒기를', '쫒기며', '쫒기는', '쫒기나', '쫒겨', '쫒겻', '쫒겼', '쫒았', '쫒다', '쫒고',
|
139
139
|
'줫는', '줫어', '줬는', '줫군', '줬다', '줬어', '천조', '쫒기', '해줫더니', '줫다', '내쫒은',
|
140
|
-
'내쫒다', '좇아',
|
140
|
+
'내쫒다', '좇아', "날개",
|
141
141
|
'ㅡ'
|
142
142
|
]
|
143
143
|
FALSE_POSITIVE_PATTERNS_MINOR = [
|
@@ -162,7 +162,7 @@ FALSE_POSITIVE_PATTERNS_SEXUAL = [
|
|
162
162
|
'빨간색', '초록색', '보라색', '청색', '핑크색', '남색', '검은색', '하양색', '주황색', '연두색',
|
163
163
|
'스공', '스시', '스키장', '스킨', '스킬', '스틸', '스탑', '스트레스', '해야', '카시야스', '야스톤', '유니섹스', '스튜디오',
|
164
164
|
'위대한', '소유자', '작업자', '자기위로', '위대하지', '암살자', '학자',
|
165
|
-
'freenude',
|
165
|
+
'freenude', "상자"
|
166
166
|
]
|
167
167
|
FALSE_POSITIVE_PATTERNS_BELITTLE = [
|
168
168
|
'려운지', '무서운지', '라운지', '운지법', '싸운지', '운지버섯', '운지린다', '깔보다', '깔보시',
|
@@ -356,7 +356,7 @@ def build_flexible_regex(pattern_in_processed_text: str):
|
|
356
356
|
|
357
357
|
reverse_multi_map = {}
|
358
358
|
for k, v in MULTI_CHAR_REPLACEMENTS.items():
|
359
|
-
|
359
|
+
reverse_multi_map.setdefault(v.lower(), set()).add(k)
|
360
360
|
|
361
361
|
for char in pattern_in_processed_text:
|
362
362
|
char_lower = char.lower()
|
@@ -456,7 +456,7 @@ def check_and_report_profanity_pattern(text: str, level: str = 'general'):
|
|
456
456
|
if level == 'english' and BETTER_PROFANITY_LOADED:
|
457
457
|
fp_regex_english = get_false_positive_regex('english')
|
458
458
|
if fp_regex_english:
|
459
|
-
|
459
|
+
text_for_better_profanity = fp_regex_english.sub('', text_for_better_profanity)
|
460
460
|
|
461
461
|
text_for_better_profanity = text_for_better_profanity.replace("*", "")
|
462
462
|
|
@@ -467,12 +467,12 @@ def check_and_report_profanity_pattern(text: str, level: str = 'general'):
|
|
467
467
|
censored_words_in_segment = re.findall(r'\b\w*▩+\w*\b', censored_text.lower())
|
468
468
|
|
469
469
|
if censored_words_in_segment:
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
470
|
+
original_words_split = text_for_better_profanity.split()
|
471
|
+
censored_words_split = censored_text.split()
|
472
|
+
for ow, cw in zip(original_words_split, censored_words_split):
|
473
|
+
if '▩' in cw:
|
474
|
+
detected_word_approx = ow
|
475
|
+
break
|
476
476
|
|
477
477
|
if detected_word_approx:
|
478
478
|
normalized_detected_word = normalize_for_custom_comparison(detected_word_approx)
|
@@ -550,7 +550,7 @@ def highlight_profanity(text: str, id: int = None, level: str = 'general', highl
|
|
550
550
|
|
551
551
|
if level.lower() == 'all':
|
552
552
|
levels = ['general', 'minor', 'sexual', 'belittle', 'race', 'parent',
|
553
|
-
|
553
|
+
'special', 'politics']
|
554
554
|
|
555
555
|
if BETTER_PROFANITY_LOADED:
|
556
556
|
levels.append('english')
|
@@ -625,7 +625,7 @@ def check_and_report_profanity_pattern(text: str, level: str = 'general'):
|
|
625
625
|
return include_match.group(0)
|
626
626
|
|
627
627
|
if processed_text in EXACT_MATCH_PROFANITY and level == 'general':
|
628
|
-
|
628
|
+
pass
|
629
629
|
|
630
630
|
fp_regex = get_false_positive_regex(level)
|
631
631
|
text_without_false_positives = fp_regex.sub('', processed_text) if fp_regex else processed_text
|
@@ -641,14 +641,14 @@ def check_and_report_profanity_pattern(text: str, level: str = 'general'):
|
|
641
641
|
detected_word = None
|
642
642
|
for ow, cw in zip(original_words, censored_words):
|
643
643
|
if '▩' in cw:
|
644
|
-
|
645
|
-
|
644
|
+
detected_word = ow.lower()
|
645
|
+
break
|
646
646
|
|
647
647
|
if detected_word:
|
648
648
|
normalized_detected_word = normalize_for_custom_comparison(detected_word)
|
649
649
|
for exclude_regex in CUSTOM_EXCLUDE_REGEXES:
|
650
|
-
|
651
|
-
|
650
|
+
if exclude_regex.fullmatch(normalized_detected_word):
|
651
|
+
return None
|
652
652
|
return detected_word
|
653
653
|
|
654
654
|
fallback_pattern = "english_profanity_detected"
|
@@ -678,7 +678,7 @@ def check_and_report_profanity_pattern(text: str, level: str = 'general'):
|
|
678
678
|
normalized_detected_profanity = normalize_for_custom_comparison(detected_profanity_string)
|
679
679
|
for exclude_regex in CUSTOM_EXCLUDE_REGEXES:
|
680
680
|
if exclude_regex.fullmatch(normalized_detected_profanity):
|
681
|
-
|
681
|
+
return None
|
682
682
|
|
683
683
|
return detected_profanity_string
|
684
684
|
|
@@ -686,7 +686,7 @@ def check_and_report_profanity_pattern(text: str, level: str = 'general'):
|
|
686
686
|
normalized_processed_text = normalize_for_custom_comparison(processed_text)
|
687
687
|
for exclude_regex in CUSTOM_EXCLUDE_REGEXES:
|
688
688
|
if exclude_regex.fullmatch(normalized_processed_text):
|
689
|
-
|
689
|
+
return None
|
690
690
|
return processed_text
|
691
691
|
|
692
692
|
return None
|
@@ -5,14 +5,14 @@ korcen/chinese.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
korcen/english.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
korcen/general.py,sha256=mGX9Sw-z9nT0hNYoPOPxD68dIi0nlhRnfqnUxLF3rLI,25595
|
7
7
|
korcen/japanese.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
korcen/korcen.py,sha256=
|
8
|
+
korcen/korcen.py,sha256=7xJSNNGNXro0IsSZMByXC0GQq23NaZDTVjbL405P_kU,64220
|
9
9
|
korcen/minor.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
korcen/parent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
korcen/politics.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
korcen/race.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
korcen/sexual.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
14
|
korcen/special.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
korcen-1.0.
|
16
|
-
korcen-1.0.
|
17
|
-
korcen-1.0.
|
18
|
-
korcen-1.0.
|
15
|
+
korcen-1.0.1.dist-info/METADATA,sha256=-4z5cBCvoxvDu-mUwI89x-1G6eF0XLntFA1hyPEbyMI,465
|
16
|
+
korcen-1.0.1.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
17
|
+
korcen-1.0.1.dist-info/top_level.txt,sha256=2LtNKXroHMOGgMokwiUPWwsJt5kmJ7yhkZ4bmtROZ5c,7
|
18
|
+
korcen-1.0.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|