nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/libs/wsd/wsd_reader.py
ADDED
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+
+"""
+Class for dealing with WSD data.
+"""
+
+from ..reader import TaggerReader
+
+class ConllWSD(object):
+    """
+    Dummy class for storing column positions in a CoNLL file.
+    """
+    id = 0
+    word = 1
+    pos = 2
+    wsd = 3
+    SEP = '\t'
+
+class WSDReader(TaggerReader):
+    """
+    This class reads data from a WSD corpus and turns it into a format
+    readable by the neural network for the WSD task.
+    """
+
+    def __init__(self, md=None, filename=None, load_dictionaries=True):
+        """
+        Constructor
+        """
+        self.rare_tag = None
+        self.sentences = []
+        if filename is not None:
+            try:
+                self._read_plain(filename)
+            except Exception:
+                self._read_conll(filename)
+
+        super(WSDReader, self).__init__(md, load_dictionaries=load_dictionaries)
+
+    @property
+    def task(self):
+        """
+        Abstract Base Class (ABC) attribute.
+        """
+        return 'wsd'
+
+    def _read_plain(self, filename):
+        """
+        Read data from a "plain" file, with one sentence per line and each
+        token written as token_tag.
+        """
+        self.sentences = []
+        with open(filename, 'rt') as f:
+            for line in f:
+                #line = unicode(line, 'utf-8')
+                items = line.strip().split()
+                sentence = []
+                for item in items:
+                    token, tag = item.rsplit('_', 1)
+                    sentence.append((token, tag))
+
+                self.sentences.append(sentence)
+
+    def _read_conll(self, filename):
+        """
+        Read data from a CoNLL formatted file. It expects at least 4 columns:
+        id, surface word, POS tag,
+        and the WSD tag.
+        """
+        self.sentences = []
+        sentence = []
+        with open(filename, 'rt') as f:
+            for line in f:
+                line = line.strip()
+                if line == '':
+                    if len(sentence) > 0:
+                        self.sentences.append(sentence)
+                        sentence = []
+                    continue
+
+                fields = line.split(ConllWSD.SEP)
+                try:
+                    word = fields[ConllWSD.word]
+                    pos = fields[ConllWSD.pos]
+                    wsd = fields[ConllWSD.wsd]
+                except IndexError: continue
+                sentence.append((word, wsd))
+                #sentence.append((word, pos, ner))
+
+        if len(sentence) > 0:
+            self.sentences.append(sentence)
+
+# backwards compatibility
+MacMorphoReader = WSDReader
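For orientation, the sketch below shows how this reader could be driven once the wheel is installed. It is a hypothetical example, not taken from the package: the corpus path is made up, and the `md` metadata and dictionary machinery from `tag.libs` are simply left at their defaults.

    # Hypothetical sketch: load a WSD corpus and walk the parsed sentences.
    # 'wsd_corpus.txt' is a placeholder path; the file may be in either layout:
    #   - "plain" format: one sentence per line, tokens written as token_tag
    #   - CoNLL format: tab-separated id / word / POS / WSD columns, blank line between sentences
    from nltkor.tag.libs.wsd.wsd_reader import WSDReader

    reader = WSDReader(filename='wsd_corpus.txt', load_dictionaries=False)
    for sentence in reader.sentences:      # each sentence is a list of (token, tag) pairs
        for token, tag in sentence:
            print(token, tag)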
nltkor/tokenize/__init__.py
ADDED
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2020 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# Contributors: matthewmc, clouds56
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+r"""
+NLTK Tokenizer Package
+
+Tokenizers divide strings into lists of substrings. For example,
+tokenizers can be used to find the words and punctuation in a string:
+
+    >>> from nltk.tokenize import word_tokenize
+    >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
+    ... two of them.\n\nThanks.'''
+    >>> word_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+This particular tokenizer requires the Punkt sentence tokenization
+models to be installed. NLTK also provides a simpler,
+regular-expression based tokenizer, which splits text on whitespace
+and punctuation:
+
+    >>> from nltk.tokenize import wordpunct_tokenize
+    >>> wordpunct_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+We can also operate at the level of sentences, using the sentence
+tokenizer directly as follows:
+
+    >>> from nltk.tokenize import sent_tokenize, word_tokenize
+    >>> sent_tokenize(s)
+    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
+    >>> [word_tokenize(t) for t in sent_tokenize(s)]
+    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
+    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
+
+Caution: when tokenizing a Unicode string, make sure you are not
+using an encoded version of the string (it may be necessary to
+decode it first, e.g. with ``s.decode("utf8")``.
+
+NLTK tokenizers can produce token-spans, represented as tuples of integers
+having the same semantics as string slices, to support efficient comparison
+of tokenizers. (These methods are implemented as generators.)
+
+    >>> from nltk.tokenize import WhitespaceTokenizer
+    >>> list(WhitespaceTokenizer().span_tokenize(s))
+    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
+    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
+
+There are numerous ways to tokenize text. If you need more control over
+tokenization, see the other methods provided in this package.
+
+For further information, please see Chapter 3 of the NLTK book.
+"""
+from nltkor.tokenize.ko_tokenize import Ko_tokenize
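The doctest examples in the docstring above are carried over verbatim from NLTK's own `nltk.tokenize` module; the only name this `__init__.py` actually exposes is `Ko_tokenize`. Assuming the wheel is installed, the package-level import in nltkor is therefore simply:

    from nltkor.tokenize import Ko_tokenize   # re-exported from nltkor.tokenize.ko_tokenize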
nltkor/tokenize/ko_tokenize.py
ADDED
@@ -0,0 +1,115 @@
+# Natural Language Toolkit for Korean: NLTKor's very own tokenizer.
+#
+# Copyright (C) 2001-2020 NLTKor Project
+# Author:
+# URL: <http://>
+# For license information, see LICENSE.TXT
+
+
+import re
+import unicodedata
+
+# for Korean
+
+class Enum(object):
+    def __init__(self, names):
+        for value, name in enumerate(names.split()): setattr(self, name, value)
+
+class Ko_tokenize():
+
+    def word(target, encoding='utf8'):
+        """ Word Tokenizer
+
+        Tokenizes the input at the word level.
+
+        Arguments (all required):
+
+        target : the string to be tokenized
+
+        Returns : the tokenization result as a list
+
+        """
+        isHangulSyllables = lambda x: unicodedata.name(x).find("HANGUL SYLLABLE") == 0
+        isHanjaSyllables = lambda x: unicodedata.name(x).find("CJK") == 0
+        isNumber = lambda x: unicodedata.name(x).find("FULLWIDTH DIGIT") == 0 or unicodedata.name(x).find("DIGIT") == 0
+        isAlphabet = lambda x: unicodedata.name(x).find("FULLWIDTH LATIN") == 0 or unicodedata.name(x).find("LATIN") == 0
+        isAlphabet_Connection = lambda x: x in (".", "-", "_", "|")
+        isNumber_Connection = lambda x: x in (".", ",")
+        isPunctuation = lambda x: unicodedata.category(x)[0] == "P"
+        isSymbol = lambda x: unicodedata.category(x)[0] == "S"
+        getCategory = lambda x: unicodedata.category(x)
+
+        TYPE = Enum("UNKNOWN SYMBOL NUMBER PUNCTUATION ALPHABET HANJA HANGUL")
+
+        buf = str()
+        type_prev = 0
+        type_cur = 0
+
+        if type(target) == str:
+            target = target
+
+        for i in range(len(target)):
+            ch = target[i]
+            ca = str()
+            try:
+                if isHangulSyllables(ch): type_cur = TYPE.HANGUL
+                elif isHanjaSyllables(ch): type_cur = TYPE.HANJA
+                elif isNumber(ch): type_cur = TYPE.NUMBER
+                elif isAlphabet(ch): type_cur = TYPE.ALPHABET
+                elif isAlphabet_Connection(ch) and type_prev == TYPE.ALPHABET:
+                    if i+1 < len(target) and not isAlphabet(target[i+1]): type_cur = TYPE.SYMBOL
+                    else: type_cur = TYPE.ALPHABET
+                elif isNumber_Connection(ch) and type_prev == TYPE.NUMBER:
+                    if i+1 < len(target) and not isNumber(target[i+1]): type_cur = TYPE.SYMBOL
+                    elif i+1 == len(target): type_cur = TYPE.SYMBOL
+                    else: type_cur = TYPE.NUMBER
+                elif isPunctuation(ch): type_cur = TYPE.PUNCTUATION
+                elif isSymbol(ch): type_cur = TYPE.SYMBOL
+                else: type_cur = TYPE.UNKNOWN
+                ca = getCategory(ch)
+            except ValueError:
+                type_cur = TYPE.UNKNOWN
+            if type_cur == TYPE.PUNCTUATION:
+                if ca in ("Ps", "Pe"): buf += " "
+                elif i >= 0 and i < len(target) and target[i-1] != target[i]: buf += " "
+            elif type_cur != type_prev: buf += " "
+            buf += ch
+            type_prev = type_cur
+        return buf.split()
+
+
+    def syllable(text, blank=False):
+        """
+        Syllable tokenizer
+
+        Tokenizes at the syllable level.
+
+        박찬양
+        """
+        emjeol_list = list()
+        for emjeol in text:
+
+            if blank and (emjeol not in ['\n']):
+                emjeol_list.append(emjeol)
+
+            elif emjeol not in [' ', '\n']:
+                emjeol_list.append(emjeol)
+
+        return emjeol_list
+
+
+    def sentence(text):
+        """
+        Sentence tokenizer
+
+        Tokenizes at the sentence level.
+        """
+        txt = text.replace("\n", " ")
+        p = re.compile(r'(?<!\w\.\w.)(?<=\.|\?|\!)\s').split(txt)
+        result = []
+        for tmp in p:
+            if (tmp == ' ' or tmp == ''):
+                continue
+            else: result.append(tmp.strip(" "))
+
+        return result
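A brief usage sketch of the three tokenizers above, assuming the wheel is installed. The functions take no `self`, so they are called directly on the class; the sample sentence is illustrative only.

    from nltkor.tokenize import Ko_tokenize

    text = "NLTKor는 한국어 전처리를 지원한다. 예시 문장이다."
    words = Ko_tokenize.word(text)          # word-level tokens, split on character-class changes
    syllables = Ko_tokenize.syllable(text)  # syllable list; pass blank=True to keep spaces
    sentences = Ko_tokenize.sentence(text)  # sentences split after '.', '?' or '!' plus whitespace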
nltkor/trans.py
ADDED
@@ -0,0 +1,121 @@
+import requests
+import json
+from datetime import datetime
+import hmac
+import base64
+import uuid
+import time
+import re
+from bs4 import BeautifulSoup as bs
+
+
+class papago:
+
+    def __init__(self):
+
+        response = requests.get('https://papago.naver.com')
+        html = bs(response.text, 'html.parser')
+        pattern1 = r'/vendors~main.*chunk.js'
+
+        for tmp in html.find_all('script'):
+            tmp = str(tmp)
+            m = re.search(pattern1, tmp)
+            if m is not None:
+                a = m.group()
+
+        js_url = 'https://papago.naver.com' + str(a)
+        rest = requests.get(js_url)
+        org = rest.text
+        pattern2 = r'AUTH_KEY:[\s]*"[\w.]+"'
+        self.match = str(re.findall(pattern2, org)).split('"')[1]
+
+    # generate the security key used in the request headers
+    def hmac_md5(self, key, s):
+        return base64.b64encode(hmac.new(key.encode('utf-8'), s.encode('utf-8'), 'MD5').digest()).decode()
+
+
+    def translate(self, data, source, target):
+
+        url = 'https://papago.naver.com/apis/n2mt/translate'
+        AUTH_KEY = self.match
+
+        dt = datetime.now()
+        timestamp = str(round(dt.timestamp()*1000))
+
+        # use a fresh UUID each time so the server does not block a reused fixed value
+        deviceId = str(uuid.uuid4())
+
+        headers = {
+            'authorization': 'PPG ' + deviceId + ':' + self.hmac_md5(AUTH_KEY, deviceId + '\n' + url + '\n' + timestamp),
+            'timestamp': timestamp
+        }
+
+        form_data = {
+            'deviceId': deviceId,
+            'locale': 'ko',
+            'dict': 'true',
+            'dictDisplay': 30,
+            'honorific': 'false',
+            'instant': 'false',
+            'paging': 'false',
+            'source': source,
+            'target': target,
+            'text': data
+        }
+
+        res_data = requests.post(url, data=form_data, headers=headers)
+
+        # inspect the full Papago translation response
+        #print("\n\n\n", res_data.json())
+
+        return res_data.json()['translatedText']
+
+
+    def e2k(self, sent_list):
+
+        patient = 0
+        return_list = []
+
+        for line in sent_list:
+            line = line.strip()
+            try:
+                text = self.translate(line, 'en', 'ko')  ## translation
+            except (KeyError, requests.exceptions.ConnectionError) as e:
+                if patient > 5:
+                    ## give up after more than 5 accumulated errors
+                    exit()
+                patient += 1
+                time.sleep(30)  ## wait 30 seconds before retrying after an error
+                continue
+
+            return_list.append(text)
+
+            #print(json.dumps(result, ensure_ascii=False), flush=True, file=ofp)  ## save as JSON lines
+        return return_list
+
+
+    def k2e(self, sent_list):
+
+        patient = 0
+        return_list = []
+
+        for line in sent_list:
+            line = line.strip()
+            try:
+                text = self.translate(line, 'ko', 'en')  ## translation
+            except (KeyError, requests.exceptions.ConnectionError) as e:
+                if patient > 5:
+                    ## give up after more than 5 accumulated errors
+                    exit()
+                patient += 1
+                time.sleep(30)  ## wait 30 seconds before retrying after an error
+                continue
+
+            return_list.append(text)
+
+            #print(json.dumps(result, ensure_ascii=False), flush=True, file=ofp)  ## save as JSON lines
+        return return_list
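Finally, a hedged sketch of how the `papago` wrapper above might be used. It scrapes papago.naver.com for an AUTH_KEY when it is constructed, so it needs network access and can stop working whenever Naver changes the page; the example strings are illustrative.

    from nltkor.trans import papago

    translator = papago()                           # fetches the AUTH_KEY from papago.naver.com
    korean = translator.e2k(["Good morning."])      # English -> Korean, returns a list of strings
    english = translator.k2e(["좋은 아침입니다."])  # Korean -> English, returns a list of strings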