purism 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
purism-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Siu Kim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
purism-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: purism
3
+ Version: 1.0.0
4
+ Summary: Automatic data filtering library specialized in Korean data purification
5
+ Author: Lumia101
6
+ Project-URL: Homepage, https://github.com/Lumia101/purism/tree/main
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: torch
11
+ Requires-Dist: transformers
12
+ Requires-Dist: bitsandbytes
13
+ Requires-Dist: lingua-language-detector
14
+ Requires-Dist: datasketch[redis]
15
+ Requires-Dist: ftfy
16
+ Dynamic: license-file
17
+
18
+ # Purism: Automatic data filtering library specialized in Korean data purification
19
+ > **Puri**fy **s**yste**m**
20
+
21
+ ## Summary
22
+ This repository is an automatic data filtering library specialized in Korean data purification.
23
+
24
+ # Quickstart
25
+ Coming soon...
26
+
27
+ ## API
purism-1.0.0/README.md ADDED
@@ -0,0 +1,10 @@
1
+ # Purism: Automatic data filtering library specialized in Korean data purification
2
+ > **Puri**fy **s**yste**m**
3
+
4
+ ## Summary
5
+ This repository is an automatic data filtering library specialized in Korean data purification.
6
+
7
+ ## Quickstart
8
+ Coming soon...
9
+
10
+ ## API
@@ -0,0 +1,9 @@
1
+ from .pipeline import PurifyConfig
2
+ from .normalizers import UnicodeCleaner, UICleaner, TextCleaner
3
+ from .filters import LengthFilter, HarmfulWordsFilter, SpamWordsFilter, SignAbuseFilter, PIIFilter, LanguageFilter, DedupFilter, PPLFilter
4
+
5
+ __all__ = [
6
+ "PurifyConfig", "UnicodeCleaner", "UICleaner", "TextCleaner",
7
+ "LengthFilter", "HarmfulWordsFilter", "SpamWordsFilter",
8
+ "SignAbuseFilter", "PIIFilter", "LanguageFilter", "DedupFilter", "PPLFilter"
9
+ ]
@@ -0,0 +1,8 @@
1
+ from .simple_filter import LengthFilter, HarmfulWordsFilter, SpamWordsFilter, SignAbuseFilter, PIIFilter
2
+ from .advanced_filter import LanguageFilter, DedupFilter
3
+ from .model_filter import PPLFilter
4
+
5
+ __all__ = [
6
+ "LengthFilter", "HarmfulWordsFilter", "SpamWordsFilter", "SignAbuseFilter",
7
+ "PIIFilter", "LanguageFilter", "DedupFilter", "PPLFilter"
8
+ ]
@@ -0,0 +1,80 @@
1
+ # Load libraries
2
+ import hashlib
3
+ from lingua import Language, LanguageDetectorBuilder
4
+ from datasketch import MinHash, MinHashLSH
5
+ from .base_filter import BaseFilter
6
+
7
+ # Use the lingua library to remove non-Korean sentences
8
class LanguageFilter(BaseFilter):
    """Keep only text that the lingua detector scores as Korean.

    The detector is built over Korean, Japanese, Chinese and English. A text
    passes when its Korean confidence is at least ``threshold``.
    """

    def __init__(self, threshold=0.6):
        candidate_languages = (
            Language.KOREAN,
            Language.JAPANESE,
            Language.CHINESE,
            Language.ENGLISH,
        )
        builder = LanguageDetectorBuilder.from_languages(*candidate_languages)
        self.judge = builder.build()
        self.threshold = threshold

    def apply(self, text: str):
        """Return True when the Korean confidence of ``text`` reaches the threshold."""
        confidence = self.judge.compute_language_confidence(text, Language.KOREAN)

        # A missing confidence value is treated as "not Korean".
        if confidence is None:
            return False

        return bool(confidence >= self.threshold)
28
+
29
+ # Remove duplicate content
30
class DedupFilter(BaseFilter):
    """Reject texts that duplicate something this instance has already accepted.

    Exact duplicates are caught with a SHA-256 digest of the stripped text.
    Near duplicates are caught with MinHash signatures over character
    shingles, indexed in a ``MinHashLSH``.
    """

    def __init__(self, threshold=0.7, num_perm=128, shingles=3):
        self.threshold = threshold
        self.num_perm = num_perm
        self.shingle = shingles
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        self.exact_hashes = set()
        self.count = 0

    def get_minhash(self, text):
        """Build a MinHash signature from the character shingles of ``text``."""
        signature = MinHash(num_perm=self.num_perm)
        width = self.shingle

        # Text shorter than one shingle is hashed as a single token.
        if len(text) < width:
            signature.update(text.encode('utf8'))
            return signature

        shingle_starts = range(len(text) - width + 1)
        for start in shingle_starts:
            piece = text[start:start + width]
            signature.update(piece.encode("utf8"))

        return signature

    def apply(self, text: str):
        """Return True for unseen text, False for empty, exact- or near-duplicate text."""
        stripped = text.strip()
        if not stripped:
            return False

        # Exact duplicate check; the digest is recorded before the near-duplicate check.
        digest = hashlib.sha256(stripped.encode("utf8")).hexdigest()
        if digest in self.exact_hashes:
            return False
        self.exact_hashes.add(digest)

        # Near duplicate check: any hit in the LSH index rejects the text.
        signature = self.get_minhash(stripped)
        if self.lsh.query(signature):
            return False

        # Only accepted texts are inserted, keyed by a running counter.
        self.count += 1
        self.lsh.insert(str(self.count), signature)
        return True
@@ -0,0 +1,6 @@
1
+ from abc import ABC, abstractmethod
2
+
3
class BaseFilter(ABC):
    """Abstract interface that every text filter implements."""

    @abstractmethod
    def apply(self, text: str) -> bool:
        """Decide whether ``text`` passes the filter (True) or is dropped (False)."""
        ...
@@ -0,0 +1,55 @@
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
3
+ from .base_filter import BaseFilter
4
+
5
class PPLFilter(BaseFilter):
    """Drop text whose perplexity under a causal language model is too high.

    The model is loaded with 4-bit NF4 quantization (double quantization
    enabled) when the filter is constructed.

    Args:
        ppl_threshold: Largest perplexity that still passes the filter.
        model_id: Model identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.
        max_length: Token limit; longer inputs are truncated before scoring.
        min_chars: Stripped texts shorter than this are rejected without
            scoring.
    """

    def __init__(
        self,
        ppl_threshold=180.0,
        model_id="LiquidAI/LFM2.5-1.2B-Instruct",
        max_length=512,
        min_chars=10,
    ):
        self.model_id = model_id
        self.max_length = max_length
        self.min_chars = min_chars

        self.quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            dtype="auto",
            quantization_config=self.quant_config
        )
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Despite its name this holds the perplexity threshold; the attribute
        # name is kept so existing code that reads it keeps working.
        self.drop_ratio = ppl_threshold

    def compute_ppl(self, text: str):
        """Return ``exp(loss)`` for ``text``, with the input ids used as labels.

        The result is a Python float and may be ``inf`` when the loss is
        large enough for the exponential to overflow.
        """
        # Inputs are moved to the device of the model's first parameter.
        device = next(self.model.parameters()).device
        enc = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        ).to(device)

        with torch.no_grad():
            output = self.model(**enc, labels=enc["input_ids"])

        # .item() already yields float("inf") for an infinite tensor value.
        return torch.exp(output.loss).item()

    def apply(self, text: str):
        """Return True when ``text`` is long enough and its perplexity is within the threshold."""
        text = text.strip()
        if len(text) < self.min_chars:
            return False

        return self.compute_ppl(text) <= self.drop_ratio
@@ -0,0 +1,116 @@
1
+ # Load libraries
2
+ import re
3
+ from .base_filter import BaseFilter
4
+ from importlib.resources import files
5
+
6
+ # If the length is too long or too short, remove it
7
class LengthFilter(BaseFilter):
    """Pass text whose length, after stripping surrounding whitespace, is within bounds."""

    def __init__(self, min_len=50, max_len=10000):
        self.min_len, self.max_len = min_len, max_len

    def apply(self, text: str):
        """Return True when ``min_len <= len(text.strip()) <= max_len``."""
        stripped_length = len(text.strip())
        return bool(self.min_len <= stripped_length <= self.max_len)
17
+
18
+ # Remove any inappropriate words
19
+ # You can customize settings by editing the .txt file.
20
class HarmfulWordsFilter(BaseFilter):
    """Reject text that contains too many entries from the harmful-word list.

    The list is read from ``harmful_words.txt`` in the ``purism.resources``
    package, one entry per line, and compiled into a single alternation
    pattern. Matching is plain substring matching.

    Args:
        threshold: Number of matches at which the text is rejected.
    """

    def __init__(self, threshold=5):
        self.filepath = files("purism.resources").joinpath(
            "harmful_words.txt"
        )
        self.pattern = self.load_and_compile()
        self.threshold = threshold

    def load_and_compile(self):
        """Read the word list and return one compiled alternation pattern.

        Raises:
            ValueError: If the file contains no non-blank lines.
        """
        # Open through the resource object itself: the built-in open() needs a
        # real filesystem path, which a package resource is not guaranteed to be.
        with self.filepath.open('r', encoding='utf-8') as f:
            entries = {line.strip() for line in f}
        entries.discard("")

        # Longest entries first so a longer word wins over a shorter one that is
        # its prefix; ties are ordered alphabetically to keep the pattern stable.
        words = sorted(entries, key=lambda word: (-len(word), word))

        # If the file is empty, stop running.
        if not words:
            raise ValueError("It seems the list of forbidden words is empty.")

        return re.compile('|'.join(map(re.escape, words)))

    def apply(self, text: str):
        """Return False once ``threshold`` matches are found in ``text``, else True."""
        if not self.pattern:
            return False

        # Stop scanning as soon as the threshold is reached, for speed.
        for seen, _ in enumerate(self.pattern.finditer(text), start=1):
            if seen >= self.threshold:
                return False
        return True
51
+
52
class SpamWordsFilter(BaseFilter):
    """Reject text that contains too many entries from the spam-word list.

    The list is read from ``spam_words.txt`` in the ``purism.resources``
    package, one entry per line, and compiled into a single alternation
    pattern. Matching is plain substring matching.

    Args:
        threshold: Number of matches at which the text is rejected.
    """

    def __init__(self, threshold=8):
        self.filepath = files("purism.resources").joinpath(
            "spam_words.txt"
        )
        self.pattern = self.load_and_compile()
        self.threshold = threshold

    def load_and_compile(self):
        """Read the spam-word list and return one compiled alternation pattern.

        Raises:
            ValueError: If the file contains no non-blank lines.
        """
        # Open through the resource object itself: the built-in open() needs a
        # real filesystem path, which a package resource is not guaranteed to be.
        with self.filepath.open('r', encoding='utf-8') as f:
            entries = {line.strip() for line in f}
        entries.discard("")

        # Longest entries first so a longer word wins over a shorter one that is
        # its prefix; ties are ordered alphabetically to keep the pattern stable.
        words = sorted(entries, key=lambda word: (-len(word), word))

        # If the file is empty, stop running.
        if not words:
            raise ValueError("It seems the list of forbidden words is empty.")

        return re.compile('|'.join(map(re.escape, words)))

    def apply(self, text: str):
        """Return False once ``threshold`` matches are found in ``text``, else True."""
        if not self.pattern:
            return False

        # Stop scanning as soon as the threshold is reached, for speed.
        for seen, _ in enumerate(self.pattern.finditer(text), start=1):
            if seen >= self.threshold:
                return False
        return True
83
+
84
+ # If the text uses too many symbols, remove it
85
class SignAbuseFilter(BaseFilter):
    """Reject text in which too large a share of characters are symbols.

    A "symbol" is any character that is not an ASCII letter, an ASCII digit,
    a character in the range 가-힣, or whitespace.
    """

    def __init__(self, threshold=0.3):
        self.threshold = threshold
        self.signabuse = re.compile(r'[^a-zA-Z0-9가-힣\s]')

    def apply(self, text: str):
        """Return False for empty text or when the symbol ratio reaches the threshold."""
        total_chars = len(text)
        symbol_chars = sum(1 for _ in self.signabuse.finditer(text))

        if total_chars == 0:
            return False

        # The ratio is taken over the full, unstripped length.
        ratio = symbol_chars / total_chars
        return not (ratio >= self.threshold)
99
+
100
+ # If personal information is included, remove it
101
class PIIFilter(BaseFilter):
    """Reject text that matches any of the compiled personal-information patterns."""

    def __init__(self):
        # Pattern sources for the key forms of personal information.
        pattern_sources = {
            "resident_number": r'\d{2}([01]\d[0123]\d)-?[1-4]\d{6}',
            "phone_number": r'01[016789]-?\d{3,4}-?\d{4}',
            "email": r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            "card_number": r'(?:\d{4}[- ]?){3}\d{4}',
        }
        self.pii_patterns = {
            label: re.compile(source)
            for label, source in pattern_sources.items()
        }

    def apply(self, text: str):
        """Return False as soon as any pattern is found in ``text``, otherwise True."""
        found = any(
            pattern.search(text) for pattern in self.pii_patterns.values()
        )
        return not found
@@ -0,0 +1,5 @@
1
+ from .normalizer import UnicodeCleaner, UICleaner, TextCleaner
2
+
3
+ __all__ = [
4
+ "UnicodeCleaner", "UICleaner", "TextCleaner"
5
+ ]
@@ -0,0 +1,6 @@
1
+ from abc import ABC, abstractmethod
2
+
3
class BaseNormalizer(ABC):
    """Abstract interface that every text normalizer implements."""

    @abstractmethod
    def normalize(self, text: str) -> str:
        """Return the normalized form of ``text``."""
        ...
@@ -0,0 +1,54 @@
1
+ # Load libraries
2
+ from .base_normalizer import BaseNormalizer
3
+ import re
4
+ import unicodedata
5
+ import html
6
+ from ftfy import fix_text
7
+
8
+ # Unicode-based text normalization
9
class UnicodeCleaner(BaseNormalizer):
    """Apply one of the Unicode normalization forms NFC, NFD, NFKC or NFKD."""

    def __init__(self, type="NFC"):
        # The parameter name shadows the builtin but is part of the public signature.
        supported_forms = ("NFC", "NFD", "NFKC", "NFKD")
        if type not in supported_forms:
            raise ValueError("Invalid Unicode Normalization method.")
        self.unicode_type = type

    def normalize(self, text: str) -> str:
        """Return ``text`` in the configured form; falsy input yields an empty string."""
        return unicodedata.normalize(self.unicode_type, text) if text else ""
23
+
24
+ # Remove useless sentences such as HTML tags
25
class UICleaner(BaseNormalizer):
    """Strip HTML markup and control characters from text."""

    def __init__(self):
        self.html_tag_re = re.compile(r'<[^>]+>')
        self.control_char_re = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')

    def normalize(self, text: str) -> str:
        """Unescape HTML entities, replace tags with a space, then drop control characters.

        Falsy input yields an empty string.
        """
        if not text:
            return ""

        # Entities are unescaped first, so escaped tags are removed as well.
        unescaped = html.unescape(text)
        without_tags = self.html_tag_re.sub(" ", unescaped)
        return self.control_char_re.sub("", without_tags)
39
+
40
+ # Reduce too many identical characters
41
class TextCleaner(BaseNormalizer):
    """Repair text with ftfy, shorten repeated characters and collapse whitespace."""

    def __init__(self):
        self.whitespace_re = re.compile(r'\s+')
        self.repeat_re = re.compile(r"(ㅋ|ㅎ|ㅠ|ㅜ|!|\.)\1{2,}")

    def normalize(self, text: str) -> str:
        """Return the cleaned text; falsy input yields an empty string."""
        if not text:
            return ""

        repaired = fix_text(text)
        # Runs of three or more of the listed characters are cut down to two.
        shortened = self.repeat_re.sub(r"\1\1", repaired)
        # Each whitespace run becomes one space; the ends are then trimmed.
        collapsed = self.whitespace_re.sub(" ", shortened)
        return collapsed.strip()
@@ -0,0 +1,5 @@
1
+ from .pipeline import PurifyConfig
2
+
3
+ __all__ = [
4
+ "PurifyConfig"
5
+ ]
@@ -0,0 +1,31 @@
1
+ # Import all created Python codes from this repository
2
+ from purism.normalizers.normalizer import TextCleaner, UICleaner, UnicodeCleaner
3
+ from purism.filters.simple_filter import LengthFilter, HarmfulWordsFilter, SpamWordsFilter, SignAbuseFilter, PIIFilter
4
+ from purism.filters.advanced_filter import LanguageFilter
5
+ from purism.filters.model_filter import PPLFilter
6
+
7
+ # Pipeline settings for data purification
8
class PurifyConfig:
    """Run text through a chain of normalizers and then a sequence of filters.

    Args:
        filters: Iterable of objects exposing ``apply(text)``; a falsy result
            rejects the text.
        normalizer: Iterable of objects exposing ``normalize(text)``; they are
            applied in order, each one receiving the previous one's output.
    """

    def __init__(self, filters, normalizer):
        self.normalizer = normalizer
        self.filters = filters

    def purify(self, text: str):
        """Normalize ``text``, apply the filters in order and report the outcome.

        Returns:
            A dict with ``raw_text`` (the input), ``passed`` (bool),
            ``filtered_by`` (class name of the first rejecting filter, or
            None) and ``normalized_text`` (output of the normalizer chain).
        """
        # Start from the raw text so an empty normalizer list still yields a
        # defined value, and feed each normalizer the previous one's output so
        # that every normalizer contributes to the result.
        text_cleaned = text
        for step in self.normalizer:
            text_cleaned = step.normalize(text_cleaned)

        # Stop at the first filter that rejects the normalized text.
        rejected_by = None
        for text_filter in self.filters:
            if not text_filter.apply(text_cleaned):
                rejected_by = text_filter.__class__.__name__
                break

        return {
            "raw_text": text,
            "passed": rejected_by is None,
            "filtered_by": rejected_by,
            "normalized_text": text_cleaned,
        }
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,108 @@
1
+ 강간
2
+ 개새끼
3
+ 개자식
4
+ 개좆
5
+ 개차반
6
+ 거유
7
+ 계집년
8
+ 고자
9
+ 근친
10
+ 노모
11
+ 니기미
12
+ 뒤질래
13
+ 딸딸이
14
+ 때씹
15
+ 또라이
16
+ 뙤놈
17
+ 로리타
18
+ 망가
19
+ 몰카
20
+ 미친
21
+ 바바리맨
22
+ 변태
23
+ 병신
24
+ 보지
25
+ 불알
26
+ 빠구리
27
+ 사까시
28
+ 섹스
29
+ 스와핑
30
+ 쌍놈
31
+ 씨발
32
+ 씨발놈
33
+ 씨팔
34
+
35
+ 씹물
36
+ 씹빨
37
+ 씹새끼
38
+ 씹알
39
+ 씹창
40
+ 씹팔
41
+ 암캐
42
+ 애자
43
+ 야동
44
+ 야사
45
+ 야애니
46
+ 엄창
47
+ 에로
48
+ 염병
49
+ 옘병
50
+ 유모
51
+ 육갑
52
+ 은꼴
53
+ 자위
54
+ 자지
55
+ 잡년
56
+ 종간나
57
+
58
+ 좆만
59
+ 죽일년
60
+ 쥐좆
61
+ 직촬
62
+ 짱깨
63
+ 쪽바리
64
+ 창녀
65
+ 포르노
66
+ 하드코어
67
+ 호로
68
+ 화냥년
69
+ 후레아들
70
+ 후장
71
+ 희쭈그리
72
+ 바카라
73
+ 바다이야기
74
+ 카지노
75
+ 출장안마
76
+ 출장마사지
77
+ 출장맛사지
78
+ 안마방
79
+ 콜걸
80
+ 바둑이
81
+ 황금성
82
+ 홀덤바
83
+ 빠징코
84
+ 엉밑살
85
+ ㅗㅜㅑ
86
+ ㅅㅅ
87
+ ㅅㅂ
88
+ ㅂㅅ
89
+ ㄱㅅㄲ
90
+ 좆까
91
+ ㅈ까
92
+ 조까
93
+ 음란마귀
94
+ 토렌트
95
+ 대딸방
96
+ 급딸
97
+ 오르가즘
98
+ 포커
99
+ 슬롯머신
100
+ 토토
101
+ 메갈
102
+ 한남
103
+ 전라디언
104
+ 일베
105
+ 성인용품
106
+ 일베충
107
+ 느금마
108
+ 니엄마
@@ -0,0 +1,30 @@
1
+ 문의
2
+ 전화문의
3
+ 텔레그램
4
+ DM
5
+ 인스타그램
6
+ 디엠
7
+ 카톡
8
+ 클릭
9
+ 바로가기
10
+ 댓글
11
+ 답글
12
+ 추천
13
+ 비추천
14
+ 로그인
15
+ 로그아웃
16
+ 회원가입
17
+ 이메일문의
18
+ 간편로그인
19
+ 소셜로그인
20
+ 본인인증
21
+ 장바구니
22
+ 배송조회
23
+ 이전글
24
+ 다음글
25
+ 메뉴
26
+ 카테고리
27
+ 공지사항
28
+ 닫기
29
+ 사이트맵
30
+
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: purism
3
+ Version: 1.0.0
4
+ Summary: Automatic data filtering library specialized in Korean data purification
5
+ Author: Lumia101
6
+ Project-URL: Homepage, https://github.com/Lumia101/purism/tree/main
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: torch
11
+ Requires-Dist: transformers
12
+ Requires-Dist: bitsandbytes
13
+ Requires-Dist: lingua-language-detector
14
+ Requires-Dist: datasketch[redis]
15
+ Requires-Dist: ftfy
16
+ Dynamic: license-file
17
+
18
+ # Purism: Automatic data filtering library specialized in Korean data purification
19
+ > **Puri**fy **s**yste**m**
20
+
21
+ ## Summary
22
+ This repository is an automatic data filtering library specialized in Korean data purification.
23
+
24
+ # Quickstart
25
+ Coming soon...
26
+
27
+ ## API
@@ -0,0 +1,22 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ purism/__init__.py
5
+ purism.egg-info/PKG-INFO
6
+ purism.egg-info/SOURCES.txt
7
+ purism.egg-info/dependency_links.txt
8
+ purism.egg-info/requires.txt
9
+ purism.egg-info/top_level.txt
10
+ purism/filters/__init__.py
11
+ purism/filters/advanced_filter.py
12
+ purism/filters/base_filter.py
13
+ purism/filters/model_filter.py
14
+ purism/filters/simple_filter.py
15
+ purism/normalizers/__init__.py
16
+ purism/normalizers/base_normalizer.py
17
+ purism/normalizers/normalizer.py
18
+ purism/pipeline/__init__.py
19
+ purism/pipeline/pipeline.py
20
+ purism/resources/__init__.py
21
+ purism/resources/harmful_words.txt
22
+ purism/resources/spam_words.txt
@@ -0,0 +1,6 @@
1
+ torch
2
+ transformers
3
+ bitsandbytes
4
+ lingua-language-detector
5
+ datasketch[redis]
6
+ ftfy
@@ -0,0 +1,2 @@
1
+ dist
2
+ purism
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=82.0.1"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "purism"
7
+ version = "1.0.0"
8
+ description = "Automatic data filtering library specialized in Korean data purification"
9
+ authors = [
10
+ {name="Lumia101"}
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.11"
14
+ dependencies = [
15
+ "torch",
16
+ "transformers",
17
+ "bitsandbytes",
18
+ "lingua-language-detector",
19
+ "datasketch[redis]",
20
+ "ftfy"
21
+ ]
22
+
23
+ [project.urls]
24
+ "Homepage" = "https://github.com/Lumia101/purism/tree/main"
25
+
26
+ [tool.setuptools.packages.find]
27
+ where = ["."]
28
+
29
+ [tool.setuptools.package-data]
30
+ "purism.resources" = ["*.txt"]
purism-1.0.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+