PyPI - purism - Versions diffs - 1.0.0__tar.gz - Mend

purism 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

purism-1.0.0/LICENSE +21 -0
purism-1.0.0/PKG-INFO +27 -0
purism-1.0.0/README.md +10 -0
purism-1.0.0/purism/__init__.py +9 -0
purism-1.0.0/purism/filters/__init__.py +8 -0
purism-1.0.0/purism/filters/advanced_filter.py +80 -0
purism-1.0.0/purism/filters/base_filter.py +6 -0
purism-1.0.0/purism/filters/model_filter.py +55 -0
purism-1.0.0/purism/filters/simple_filter.py +116 -0
purism-1.0.0/purism/normalizers/__init__.py +5 -0
purism-1.0.0/purism/normalizers/base_normalizer.py +6 -0
purism-1.0.0/purism/normalizers/normalizer.py +54 -0
purism-1.0.0/purism/pipeline/__init__.py +5 -0
purism-1.0.0/purism/pipeline/pipeline.py +31 -0
purism-1.0.0/purism/resources/__init__.py +1 -0
purism-1.0.0/purism/resources/harmful_words.txt +108 -0
purism-1.0.0/purism/resources/spam_words.txt +30 -0
purism-1.0.0/purism.egg-info/PKG-INFO +27 -0
purism-1.0.0/purism.egg-info/SOURCES.txt +22 -0
purism-1.0.0/purism.egg-info/dependency_links.txt +1 -0
purism-1.0.0/purism.egg-info/requires.txt +6 -0
purism-1.0.0/purism.egg-info/top_level.txt +2 -0
purism-1.0.0/pyproject.toml +30 -0
purism-1.0.0/setup.cfg +4 -0

purism-1.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Siu Kim
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

purism-1.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,27 @@
+Metadata-Version: 2.4
+Name: purism
+Version: 1.0.0
+Summary: Automatic data filtering library specialized in Korean data purification
+Author: Lumia101
+Project-URL: Homepage, https://github.com/Lumia101/purism/tree/main
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch
+Requires-Dist: transformers
+Requires-Dist: bitsandbytes
+Requires-Dist: lingua-language-detector
+Requires-Dist: datasketch[redis]
+Requires-Dist: ftfy
+Dynamic: license-file
+# Purism: Automatic data filtering library specialized in Korean data purification
+> **Puri**fy **s**yste**m**
+## Summary
+This repository is an automatic data filtering library specialized in Korean data purification.
+# Quickstart
+Coming soon...
+## API

purism-1.0.0/README.md ADDED Viewed

@@ -0,0 +1,10 @@
+# Purism: Automatic data filtering library specialized in Korean data purification
+> **Puri**fy **s**yste**m**
+## Summary
+This repository is an automatic data filtering library specialized in Korean data purification.
+# Quickstart
+Coming soon...
+## API

purism-1.0.0/purism/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from .pipeline import PurifyConfig
+from .normalizers import UnicodeCleaner, UICleaner, TextCleaner
+from .filters import LengthFilter, HarmfulWordsFilter, SpamWordsFilter, SignAbuseFilter, PIIFilter, LanguageFilter, DedupFilter, PPLFilter
+__all__ = [
+    "PurifyConfig", "UnicodeCleaner", "UICleaner", "TextCleaner",
+    "LengthFilter", "HarmfulWordsFilter", "SpamWordsFilter",
+    "SignAbuseFilter", "PIIFilter", "LanguageFilter", "DedupFilter", "PPLFilter"
+]

purism-1.0.0/purism/filters/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+from .simple_filter import LengthFilter, HarmfulWordsFilter, SpamWordsFilter, SignAbuseFilter, PIIFilter
+from .advanced_filter import LanguageFilter, DedupFilter
+from .model_filter import PPLFilter
+__all__ = [
+    "LengthFilter", "HarmfulWordsFilter", "SpamWordsFilter", "SignAbuseFilter",
+    "PIIFilter", "LanguageFilter", "DedupFilter", "PPLFilter"
+]

purism-1.0.0/purism/filters/advanced_filter.py ADDED Viewed

@@ -0,0 +1,80 @@
+# Load libraries
+import hashlib
+from lingua import Language, LanguageDetectorBuilder
+from datasketch import MinHash, MinHashLSH
+from .base_filter import BaseFilter
+# Use the lingua library to remove non-Korean sentences
+class LanguageFilter(BaseFilter):
+    """Keep text whose Korean confidence score reaches ``threshold``."""
+    def __init__(self, threshold=0.6):
+        # The detector only ever chooses between these four languages.
+        candidate_languages = (
+            Language.KOREAN,
+            Language.JAPANESE,
+            Language.CHINESE,
+            Language.ENGLISH,
+        )
+        self.threshold = threshold
+        self.judge = LanguageDetectorBuilder.from_languages(*candidate_languages).build()
+    def apply(self, text: str):
+        """Return True when the Korean confidence is at least ``self.threshold``."""
+        korean_score = self.judge.compute_language_confidence(text, Language.KOREAN)
+        # A missing score is treated as "not Korean".
+        if korean_score is None:
+            return False
+        return korean_score >= self.threshold
+# Remove duplicate content
+class DedupFilter(BaseFilter):
+    """Reject blank text, exact repeats, and near-duplicates of text accepted earlier."""
+    def __init__(self, threshold=0.7, num_perm=128, shingles=3):
+        self.threshold = threshold
+        self.num_perm = num_perm
+        # Width, in characters, of each shingle fed to the MinHash.
+        self.shingle = shingles
+        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
+        # SHA-256 hex digests of every non-blank text seen so far.
+        self.exact_hashes = set()
+        # Number of texts inserted into the LSH index; also used to build their keys.
+        self.count = 0
+    def get_minhash(self, text):
+        """Build a MinHash signature from the character shingles of ``text``."""
+        width = self.shingle
+        signature = MinHash(num_perm=self.num_perm)
+        if len(text) < width:
+            # Too short to shingle: hash the whole text as a single token.
+            pieces = [text]
+        else:
+            pieces = (text[start:start + width] for start in range(len(text) - width + 1))
+        for piece in pieces:
+            signature.update(piece.encode("utf8"))
+        return signature
+    def apply(self, text: str):
+        """Return True only for text that is new; accepted text is added to the index."""
+        stripped = text.strip()
+        # Blank input is never kept.
+        if not stripped:
+            return False
+        # Exact dedup: the digest is recorded before the near-duplicate check,
+        # so a text rejected below is still remembered as seen.
+        digest = hashlib.sha256(stripped.encode("utf8")).hexdigest()
+        if digest in self.exact_hashes:
+            return False
+        self.exact_hashes.add(digest)
+        # Near dedup: any hit in the LSH index counts as a duplicate.
+        signature = self.get_minhash(stripped)
+        if self.lsh.query(signature):
+            return False
+        self.count += 1
+        self.lsh.insert(str(self.count), signature)
+        return True

purism-1.0.0/purism/filters/base_filter.py ADDED Viewed

@@ -0,0 +1,6 @@
+from abc import ABC, abstractmethod
+class BaseFilter(ABC):
+    """Interface for filters: ``apply`` decides whether a text is kept."""
+    @abstractmethod
+    def apply(self, text: str) -> bool:
+        """Return True to keep ``text`` and False to drop it."""

purism-1.0.0/purism/filters/model_filter.py ADDED Viewed

@@ -0,0 +1,55 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from .base_filter import BaseFilter
+class PPLFilter(BaseFilter):
+    """Keep text whose perplexity under a 4-bit quantized causal LM is at most a threshold."""
+    def __init__(self, ppl_threshold=180.0):
+        checkpoint = "LiquidAI/LFM2.5-1.2B-Instruct"
+        # NOTE: despite its name, ``drop_ratio`` stores the perplexity ceiling.
+        self.drop_ratio = ppl_threshold
+        self.quant_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            checkpoint,
+            device_map="auto",
+            dtype="auto",
+            quantization_config=self.quant_config,
+        )
+        self.model.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    def compute_ppl(self, text: str):
+        """Return exp(loss) of the model on ``text``, truncated to 512 tokens."""
+        # Put the inputs on whichever device holds the model's first parameter.
+        target_device = next(self.model.parameters()).device
+        encoded = self.tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            max_length=512,
+        ).to(target_device)
+        with torch.no_grad():
+            # Passing the input ids as labels makes the model return its LM loss.
+            loss = self.model(**encoded, labels=encoded["input_ids"]).loss
+        perplexity = torch.exp(loss)
+        return float("inf") if torch.isinf(perplexity) else perplexity.item()
+    def apply(self, text: str):
+        """Return True when the stripped text has 10+ characters and low enough perplexity."""
+        candidate = text.strip()
+        # Texts shorter than 10 characters are dropped without running the model.
+        if len(candidate) < 10:
+            return False
+        return self.compute_ppl(candidate) <= self.drop_ratio

purism-1.0.0/purism/filters/simple_filter.py ADDED Viewed

@@ -0,0 +1,116 @@
+# Load libraries
+import re
+from .base_filter import BaseFilter
+from importlib.resources import files
+# If the length is too long or too short, remove it
+class LengthFilter(BaseFilter):
+    """Keep text whose stripped length lies within ``[min_len, max_len]``."""
+    def __init__(self, min_len=50, max_len=10000):
+        self.min_len, self.max_len = min_len, max_len
+    def apply(self, text: str):
+        """Return True when the length, ignoring surrounding whitespace, is in range."""
+        return self.min_len <= len(text.strip()) <= self.max_len
+# Remove any inappropriate words
+# You can customize settings by editing the .txt file.
+class HarmfulWordsFilter(BaseFilter):
+    """Reject text containing at least ``threshold`` matches of the bundled harmful words."""
+    def __init__(self, threshold=5):
+        self.filepath = files("purism.resources").joinpath(
+            "harmful_words.txt"
+        )
+        self.pattern = self.load_and_compile()
+        self.threshold = threshold
+    def load_and_compile(self):
+        """Compile one alternation regex from the word list, one word per line.
+        Raises ValueError when the list contains no words.
+        """
+        source = self.filepath
+        # A packaged resource is a Traversable, which is not guaranteed to be a
+        # real filesystem path (e.g. zip installs), so read it through its own
+        # open(); fall back to the builtin for plain string paths.
+        if hasattr(source, "open"):
+            handle = source.open('r', encoding='utf-8')
+        else:
+            handle = open(source, 'r', encoding='utf-8')
+        with handle as f:
+            unique_words = {line.strip() for line in f if line.strip()}
+        # If the file is empty, stop running.
+        if not unique_words:
+            raise ValueError("It seems the list of forbidden words is empty.")
+        # Alternatives are tried left to right, so list longer words first to
+        # let them match whole instead of being cut short by a shorter word.
+        words = sorted(unique_words, key=len, reverse=True)
+        return re.compile('|'.join(map(re.escape, words)))
+    def apply(self, text: str):
+        """Return False once ``threshold`` matches are found, True otherwise."""
+        if not self.pattern:
+            return False
+        # Stop scanning as soon as the threshold is reached, for speed.
+        for hits, _ in enumerate(self.pattern.finditer(text), start=1):
+            if hits >= self.threshold:
+                return False
+        return True
+class SpamWordsFilter(BaseFilter):
+    """Reject text containing at least ``threshold`` matches of the bundled spam words."""
+    def __init__(self, threshold=8):
+        self.filepath = files("purism.resources").joinpath(
+            "spam_words.txt"
+        )
+        self.pattern = self.load_and_compile()
+        self.threshold = threshold
+    def load_and_compile(self):
+        """Compile one alternation regex from the spam word list, one word per line.
+        Raises ValueError when the list contains no words.
+        """
+        source = self.filepath
+        # A packaged resource is a Traversable, which is not guaranteed to be a
+        # real filesystem path (e.g. zip installs), so read it through its own
+        # open(); fall back to the builtin for plain string paths.
+        if hasattr(source, "open"):
+            handle = source.open('r', encoding='utf-8')
+        else:
+            handle = open(source, 'r', encoding='utf-8')
+        with handle as f:
+            unique_words = {line.strip() for line in f if line.strip()}
+        # If the file is empty, stop running.
+        if not unique_words:
+            raise ValueError("It seems the list of forbidden words is empty.")
+        # Alternatives are tried left to right, so list longer words first to
+        # let them match whole instead of being cut short by a shorter word.
+        words = sorted(unique_words, key=len, reverse=True)
+        return re.compile('|'.join(map(re.escape, words)))
+    def apply(self, text: str):
+        """Return False once ``threshold`` matches are found, True otherwise."""
+        if not self.pattern:
+            return False
+        # Stop scanning as soon as the threshold is reached, for speed.
+        for hits, _ in enumerate(self.pattern.finditer(text), start=1):
+            if hits >= self.threshold:
+                return False
+        return True
+# If the text uses too many symbols, remove it
+class SignAbuseFilter(BaseFilter):
+    """Reject text in which symbol characters make up too large a share."""
+    def __init__(self, threshold=0.3):
+        # Anything other than ASCII letters, digits, Hangul syllables and whitespace.
+        self.signabuse = re.compile(r'[^a-zA-Z0-9가-힣\s]')
+        self.threshold = threshold
+    def apply(self, text: str):
+        """Return False for empty text or when the symbol ratio reaches ``threshold``."""
+        total_chars = len(text)
+        if total_chars == 0:
+            return False
+        symbol_chars = len(self.signabuse.findall(text))
+        # The ratio is taken over the whole text, whitespace included.
+        return not (symbol_chars / total_chars >= self.threshold)
+# If personal information is included, remove it
+class PIIFilter(BaseFilter):
+    """Reject text that matches any of the compiled personal-information patterns."""
+    def __init__(self):
+        # One compiled pattern per kind of personal information.
+        resident_number = re.compile(r'\d{2}([01]\d[0123]\d)-?[1-4]\d{6}')
+        phone_number = re.compile(r'01[016789]-?\d{3,4}-?\d{4}')
+        email = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
+        card_number = re.compile(r'(?:\d{4}[- ]?){3}\d{4}')
+        self.pii_patterns = {
+            "resident_number": resident_number,
+            "phone_number": phone_number,
+            "email": email,
+            "card_number": card_number,
+        }
+    def apply(self, text: str):
+        """Return False as soon as any pattern is found in ``text``, True otherwise."""
+        return not any(
+            pattern.search(text) for pattern in self.pii_patterns.values()
+        )

purism-1.0.0/purism/normalizers/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .normalizer import UnicodeCleaner, UICleaner, TextCleaner
+__all__ = [
+    "UnicodeCleaner", "UICleaner", "TextCleaner"
+]

purism-1.0.0/purism/normalizers/base_normalizer.py ADDED Viewed

@@ -0,0 +1,6 @@
+from abc import ABC, abstractmethod
+class BaseNormalizer(ABC):
+    """Interface for normalizers: ``normalize`` maps a text to its cleaned form."""
+    @abstractmethod
+    def normalize(self, text: str) -> str:
+        """Return the normalized version of ``text``."""

purism-1.0.0/purism/normalizers/normalizer.py ADDED Viewed

@@ -0,0 +1,54 @@
+# Load libraries
+from .base_normalizer import BaseNormalizer
+import re
+import unicodedata
+import html
+from ftfy import fix_text
+# Unicode-based text normalization
+class UnicodeCleaner(BaseNormalizer):
+    """Apply one of the four Unicode normalization forms to text."""
+    def __init__(self, type="NFC"):
+        # Reject anything that is not a recognised normalization form.
+        if type not in ("NFC", "NFD", "NFKC", "NFKD"):
+            raise ValueError("Invalid Unicode Normalization method.")
+        self.unicode_type = type
+    def normalize(self, text: str) -> str:
+        """Return ``text`` in the configured form; falsy input becomes ``""``."""
+        return unicodedata.normalize(self.unicode_type, text) if text else ""
+# Strip UI residue such as HTML entities, HTML tags and control characters
+class UICleaner(BaseNormalizer):
+    """Unescape HTML entities, blank out HTML tags and delete control characters."""
+    def __init__(self):
+        # Control characters other than tab, line feed and carriage return.
+        self.control_char_re = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
+        self.html_tag_re = re.compile(r'<[^>]+>')
+    def normalize(self, text: str) -> str:
+        """Return the cleaned text; falsy input becomes ``""``."""
+        if not text:
+            return ""
+        # Unescape first so that escaped tags (e.g. ``&lt;b&gt;``) are removed too.
+        unescaped = html.unescape(text)
+        # Each tag becomes a space rather than vanishing.
+        without_tags = self.html_tag_re.sub(" ", unescaped)
+        return self.control_char_re.sub("", without_tags)
+# Collapse long runs of repeated characters and excess whitespace
+class TextCleaner(BaseNormalizer):
+    """Fix text with ftfy, shorten repeated characters and collapse whitespace."""
+    def __init__(self):
+        # Three or more of the same listed character in a row.
+        self.repeat_re = re.compile(r"(ㅋ|ㅎ|ㅠ|ㅜ|!|\.)\1{2,}")
+        self.whitespace_re = re.compile(r'\s+')
+    def normalize(self, text: str) -> str:
+        """Return the cleaned text; falsy input becomes ``""``."""
+        if not text:
+            return ""
+        repaired = fix_text(text)
+        # Long runs are cut down to exactly two characters.
+        squeezed = self.repeat_re.sub(r"\1\1", repaired)
+        # Every whitespace run becomes one space, then the ends are trimmed.
+        return self.whitespace_re.sub(" ", squeezed).strip()

purism-1.0.0/purism/pipeline/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .pipeline import PurifyConfig
+__all__ = [
+    "PurifyConfig"
+]

purism-1.0.0/purism/pipeline/pipeline.py ADDED Viewed

@@ -0,0 +1,31 @@
+# Import all created Python codes from this repository
+from purism.normalizers.normalizer import TextCleaner, UICleaner, UnicodeCleaner
+from purism.filters.simple_filter import LengthFilter, HarmfulWordsFilter, SpamWordsFilter, SignAbuseFilter, PIIFilter
+from purism.filters.advanced_filter import LanguageFilter
+from purism.filters.model_filter import PPLFilter
+# Pipeline configuration for data purification
+class PurifyConfig():
+    """Run text through a chain of normalizers and then a sequence of filters."""
+    def __init__(self, filters, normalizer):
+        # normalizer: iterable of objects exposing ``normalize(text)``.
+        self.normalizer = normalizer
+        # filters: iterable of objects exposing ``apply(text)``.
+        self.filters = filters
+    def purify(self, text: str):
+        """Normalize ``text``, then apply the filters in order until one rejects it.
+        Returns a dict with keys ``raw_text`` (the input), ``passed`` (bool),
+        ``filtered_by`` (class name of the first rejecting filter, or None)
+        and ``normalized_text`` (the output of the normalizer chain).
+        """
+        # Start from the raw text so an empty normalizer list still yields a result.
+        text_cleaned = text
+        for step in self.normalizer:
+            # Feed each normalizer the previous one's output so the steps compose.
+            text_cleaned = step.normalize(text_cleaned)
+        rejected_by = None
+        for check in self.filters:
+            if not check.apply(text_cleaned):
+                # Later filters are skipped once one has rejected the text.
+                rejected_by = check.__class__.__name__
+                break
+        return {
+            "raw_text": text,
+            "passed": rejected_by is None,
+            "filtered_by": rejected_by,
+            "normalized_text": text_cleaned
+        }

purism-1.0.0/purism/resources/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+

purism-1.0.0/purism/resources/harmful_words.txt ADDED Viewed

@@ -0,0 +1,108 @@
+강간
+개새끼
+개자식
+개좆
+개차반
+거유
+계집년
+고자
+근친
+노모
+니기미
+뒤질래
+딸딸이
+때씹
+또라이
+뙤놈
+로리타
+망가
+몰카
+미친
+바바리맨
+변태
+병신
+보지
+불알
+빠구리
+사까시
+섹스
+스와핑
+쌍놈
+씨발
+씨발놈
+씨팔
+씹
+씹물
+씹빨
+씹새끼
+씹알
+씹창
+씹팔
+암캐
+애자
+야동
+야사
+야애니
+엄창
+에로
+염병
+옘병
+유모
+육갑
+은꼴
+자위
+자지
+잡년
+종간나
+좆
+좆만
+죽일년
+쥐좆
+직촬
+짱깨
+쪽바리
+창녀
+포르노
+하드코어
+호로
+화냥년
+후레아들
+후장
+희쭈그리
+바카라
+바다이야기
+카지노
+출장안마
+출장마사지
+출장맛사지
+안마방
+콜걸
+바둑이
+황금성
+홀덤바
+빠징코
+엉밑살
+ㅗㅜㅑ
+ㅅㅅ
+ㅅㅂ
+ㅂㅅ
+ㄱㅅㄲ
+좆까
+ㅈ까
+조까
+음란마귀
+토렌트
+대딸방
+급딸
+오르가즘
+포커
+슬롯머신
+토토
+메갈
+한남
+전라디언
+일베
+성인용품
+일베충
+느금마
+니엄마

purism-1.0.0/purism/resources/spam_words.txt ADDED Viewed

@@ -0,0 +1,30 @@
+문의
+전화문의
+텔레그램
+DM
+인스타그램
+디엠
+카톡
+클릭
+바로가기
+댓글
+답글
+추천
+비추천
+로그인
+로그아웃
+회원가입
+이메일문의
+간편로그인
+소셜로그인
+본인인증
+장바구니
+배송조회
+이전글
+다음글
+메뉴
+카테고리
+공지사항
+닫기
+사이트맵
+홈

purism-1.0.0/purism.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,27 @@
+Metadata-Version: 2.4
+Name: purism
+Version: 1.0.0
+Summary: Automatic data filtering library specialized in Korean data purification
+Author: Lumia101
+Project-URL: Homepage, https://github.com/Lumia101/purism/tree/main
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch
+Requires-Dist: transformers
+Requires-Dist: bitsandbytes
+Requires-Dist: lingua-language-detector
+Requires-Dist: datasketch[redis]
+Requires-Dist: ftfy
+Dynamic: license-file
+# Purism: Automatic data filtering library specialized in Korean data purification
+> **Puri**fy **s**yste**m**
+## Summary
+This repository is an automatic data filtering library specialized in Korean data purification.
+# Quickstart
+Coming soon...
+## API

purism-1.0.0/purism.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,22 @@
+LICENSE
+README.md
+pyproject.toml
+purism/__init__.py
+purism.egg-info/PKG-INFO
+purism.egg-info/SOURCES.txt
+purism.egg-info/dependency_links.txt
+purism.egg-info/requires.txt
+purism.egg-info/top_level.txt
+purism/filters/__init__.py
+purism/filters/advanced_filter.py
+purism/filters/base_filter.py
+purism/filters/model_filter.py
+purism/filters/simple_filter.py
+purism/normalizers/__init__.py
+purism/normalizers/base_normalizer.py
+purism/normalizers/normalizer.py
+purism/pipeline/__init__.py
+purism/pipeline/pipeline.py
+purism/resources/__init__.py
+purism/resources/harmful_words.txt
+purism/resources/spam_words.txt

purism-1.0.0/purism.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

purism-1.0.0/purism.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,6 @@
+torch
+transformers
+bitsandbytes
+lingua-language-detector
+datasketch[redis]
+ftfy

purism-1.0.0/purism.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ dist
2	+ purism

purism-1.0.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,30 @@
+[build-system]
+requires = ["setuptools>=82.0.1"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "purism"
+version = "1.0.0"
+description = "Automatic data filtering library specialized in Korean data purification"
+authors = [
+  {name="Lumia101"}
+]
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "torch",
+    "transformers",
+    "bitsandbytes",
+    "lingua-language-detector",
+    "datasketch[redis]",
+    "ftfy"
+]
+[project.urls]
+"Homepage" = "https://github.com/Lumia101/purism/tree/main"
+[tool.setuptools.packages.find]
+where = ["."]
+[tool.setuptools.package-data]
+"purism.resources" = ["*.txt"]

purism-1.0.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0