nltkor-1.2.14-cp311-cp311-macosx_13_0_x86_64.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nltkor/Kor_char.py +193 -0
- nltkor/__init__.py +16 -0
- nltkor/alignment/__init__.py +1315 -0
- nltkor/cider/__init__.py +2 -0
- nltkor/cider/cider.py +55 -0
- nltkor/cider/cider_scorer.py +207 -0
- nltkor/distance/__init__.py +441 -0
- nltkor/distance/wasserstein.py +126 -0
- nltkor/etc.py +22 -0
- nltkor/lazyimport.py +144 -0
- nltkor/make_requirement.py +11 -0
- nltkor/metrics/__init__.py +63 -0
- nltkor/metrics/bartscore.py +301 -0
- nltkor/metrics/bertscore.py +331 -0
- nltkor/metrics/bleu_tensor.py +20 -0
- nltkor/metrics/classical.py +847 -0
- nltkor/metrics/entment.py +24 -0
- nltkor/metrics/eval.py +517 -0
- nltkor/metrics/mauve.py +273 -0
- nltkor/metrics/mauve_utils.py +131 -0
- nltkor/misc/__init__.py +11 -0
- nltkor/misc/string2string_basic_functions.py +59 -0
- nltkor/misc/string2string_default_tokenizer.py +83 -0
- nltkor/misc/string2string_hash_functions.py +159 -0
- nltkor/misc/string2string_word_embeddings.py +503 -0
- nltkor/search/__init__.py +10 -0
- nltkor/search/classical.py +569 -0
- nltkor/search/faiss_search.py +787 -0
- nltkor/search/kobert_tokenizer.py +181 -0
- nltkor/sejong/__init__.py +3 -0
- nltkor/sejong/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/sejong_download.cpython-39.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-38.pyc +0 -0
- nltkor/sejong/__pycache__/ssem.cpython-39.pyc +0 -0
- nltkor/sejong/ch.py +12 -0
- nltkor/sejong/dict_semClassNum.txt +491 -0
- nltkor/sejong/layer.txt +630 -0
- nltkor/sejong/sejong_download.py +87 -0
- nltkor/sejong/ssem.py +684 -0
- nltkor/similarity/__init__.py +3 -0
- nltkor/similarity/bartscore____.py +337 -0
- nltkor/similarity/bertscore____.py +339 -0
- nltkor/similarity/classical.py +245 -0
- nltkor/similarity/cosine_similarity.py +175 -0
- nltkor/tag/__init__.py +71 -0
- nltkor/tag/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-38.pyc +0 -0
- nltkor/tag/__pycache__/espresso_tag.cpython-39.pyc +0 -0
- nltkor/tag/espresso_tag.py +220 -0
- nltkor/tag/libs/__init__.py +10 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/attributes.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/config.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/metadata.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/taggers.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/utils.cpython-39.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-38.pyc +0 -0
- nltkor/tag/libs/__pycache__/word_dictionary.cpython-39.pyc +0 -0
- nltkor/tag/libs/arguments.py +280 -0
- nltkor/tag/libs/attributes.py +231 -0
- nltkor/tag/libs/config.py +159 -0
- nltkor/tag/libs/metadata.py +129 -0
- nltkor/tag/libs/ner/__init__.py +2 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/ner/__pycache__/ner_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/ner/macmorphoreader.py +7 -0
- nltkor/tag/libs/ner/ner_reader.py +92 -0
- nltkor/tag/libs/network.c +72325 -0
- nltkor/tag/libs/network.cpython-311-darwin.so +0 -0
- nltkor/tag/libs/network.pyx +878 -0
- nltkor/tag/libs/networkconv.pyx +1028 -0
- nltkor/tag/libs/networkdependencyconv.pyx +451 -0
- nltkor/tag/libs/parse/__init__.py +1 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/parse/__pycache__/parse_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/parse/parse_reader.py +283 -0
- nltkor/tag/libs/pos/__init__.py +2 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/pos/__pycache__/pos_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/pos/macmorphoreader.py +7 -0
- nltkor/tag/libs/pos/pos_reader.py +97 -0
- nltkor/tag/libs/reader.py +485 -0
- nltkor/tag/libs/srl/__init__.py +3 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/srl_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-38.pyc +0 -0
- nltkor/tag/libs/srl/__pycache__/train_srl.cpython-39.pyc +0 -0
- nltkor/tag/libs/srl/__srl_reader_.py +535 -0
- nltkor/tag/libs/srl/srl_reader.py +436 -0
- nltkor/tag/libs/srl/train_srl.py +87 -0
- nltkor/tag/libs/taggers.py +926 -0
- nltkor/tag/libs/utils.py +384 -0
- nltkor/tag/libs/word_dictionary.py +239 -0
- nltkor/tag/libs/wsd/__init__.py +2 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/__init__.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-38.pyc +0 -0
- nltkor/tag/libs/wsd/__pycache__/wsd_reader.cpython-39.pyc +0 -0
- nltkor/tag/libs/wsd/macmorphoreader.py +7 -0
- nltkor/tag/libs/wsd/wsd_reader.py +93 -0
- nltkor/tokenize/__init__.py +62 -0
- nltkor/tokenize/ko_tokenize.py +115 -0
- nltkor/trans.py +121 -0
- nltkor-1.2.14.dist-info/LICENSE.txt +1093 -0
- nltkor-1.2.14.dist-info/METADATA +41 -0
- nltkor-1.2.14.dist-info/RECORD +127 -0
- nltkor-1.2.14.dist-info/WHEEL +5 -0
- nltkor-1.2.14.dist-info/top_level.txt +1 -0
nltkor/tag/espresso_tag.py
@@ -0,0 +1,220 @@
+import os
+
+# import sys
+# sys.path.append("/Users/dowon/nltk_ko/nltk/tag")
+# from libs import *
+
+"""
+This script runs one of the Espresso taggers (POS, NER, WSD, SRL or
+dependency) on the input data and prints the results to stdout.
+"""
+
+import argparse
+import logging
+
+if __package__ is None:  # when executed directly as a script
+    import sys
+    from os import path
+    # print(path.dirname(path.dirname(path.abspath(__file__))))
+    sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
+    from libs import *
+else:
+    from .libs import *
+    # from .libs import *
+import requests
+import zipfile
+
+logging.disable(logging.INFO)  # disable INFO and DEBUG logging everywhere
+
+
+class EspressoTagger:
+    def __init__(self, data_dir=None, task="pos"):
+        self.data_dir = data_dir
+        if data_dir is None:
+            path = os.path.dirname(__file__)
+            path = path + '/data'
+            self.data_dir = path
+            # print(path)
+        self.path = ""
+        self.tagger = None
+
+        self.task = task.lower()
+        if not self._check_model():
+            self._download_model()
+
+        set_data_dir(self.data_dir)
+
+        if self.task == 'pos':
+            self.tagger = taggers.POSTagger(data_dir=self.data_dir)
+        elif self.task == 'ner':
+            self.tagger = taggers.NERTagger(data_dir=self.data_dir)
+        elif self.task == 'wsd':
+            self.tagger = taggers.WSDTagger(data_dir=self.data_dir)
+        elif self.task == 'srl':
+            self.tagger = taggers.SRLTagger(data_dir=self.data_dir)
+        elif self.task == 'dependency':
+            self.tagger = taggers.DependencyParser(data_dir=self.data_dir)
+        else:
+            raise ValueError('Unknown task: %s' % self.task)
+
+    def tag(self, text, use_sent_tokenizer=True, lemma=True):
+        """
+        Tags the given text: tokenizes it and runs the tagger chosen at
+        construction time ('pos', 'ner', 'wsd', 'srl' or 'dependency').
+
+        :param text: the input text
+        :param use_sent_tokenizer: whether to use the built-in sentence tokenizer
+        :param lemma: if True, use 'standard' (lemmatized) mode; otherwise 'eumjeol' mode
+        """
+
+        # use_sent_tokenizer = not use_sent_tokenizer
+        mode = 'standard' if lemma else 'eumjeol'
+        result = self.tagger.tag(text, use_sent_tokenizer, mode)
+        '''
+        else:
+            tokens = text.split()
+            if self.task != 'dependency':
+                result = [self.tagger.tag_tokens(tokens, True)]
+            else:
+                result = [self.tagger.tag_tokens(tokens)]
+        '''
+
+        return self._result_tagged(result, self.task)
+
+    def _result_tagged(self, tagged_sents, task):
+        """
+        Returns the tagged text.
+
+        :param tagged_sents: sentences tagged by any of the Espresso taggers
+        :param task: the tagging task (either 'pos', 'ner', 'wsd', 'srl' or 'dependency')
+        """
+
+        # TODO: change the remaining print parts to return values
+        if task == 'pos':
+            return self._return_tagged_pos(tagged_sents)
+        elif task == 'ner':
+            return self._return_tagged_ner(tagged_sents)
+        elif task == 'wsd':
+            return self._return_tagged_wsd(tagged_sents)
+        elif task == 'srl':
+            return self._return_tagged_srl(tagged_sents)
+        elif task == 'dependency':
+            return self._return_parsed_dependency(tagged_sents)
+        else:
+            raise ValueError('Unknown task: %s' % task)
+
+    def _return_parsed_dependency(self, parsed_sents):
+        """Returns one tuple per token with its head information"""
+        result = []
+        temp_list = []
+        temp_list2 = []
+        for sent in parsed_sents:
+            temp_list = sent.to_conll().split('\t')
+            temp_list = temp_list[1:]
+            for ele in temp_list:
+                if '\n' in ele:
+                    ele = ele[:ele.find('\n')]
+                temp_list2.append(ele)
+            result.append(self._dependency_after(temp_list2)[:])
+            temp_list2 = []
+
+        return result
+
+    def _return_tagged_pos(self, tagged_sents):
+        """Returns one sentence per line as token_tag"""
+        result = []
+        for sent in tagged_sents:
+            result = result + list(sent)
+        return result
+
+    def _return_tagged_srl(self, tagged_sents):
+        result = []
+        for sent in tagged_sents:
+            # print(' '.join(sent.tokens))
+            temp_dict1 = {}
+            for predicate, arg_structure in sent.arg_structures:
+                # print("test 1 :", predicate)
+                # print("te22 :", arg_structure)
+
+                temp_dict2 = {}
+                for label in arg_structure:
+                    argument = ' '.join(arg_structure[label])
+                    # line = '\t%s: %s' % (label, argument)
+                    # print(line)
+                    temp_dict2[label] = argument
+
+                    # result.append((label, argument))
+                # print('\n')
+                temp_dict1[predicate] = temp_dict2
+
+            result.append(temp_dict1)
+
+        return result
+
+    def _return_tagged_ner(self, tagged_sents):
+        """Returns one sentence per line as token_tag"""
+        result = []
+        for sent in tagged_sents:
+            for item in sent:
+                # s = '_'.join(item)
+                result.append(item)
+
+        return result
+
+    def _return_tagged_wsd(self, tagged_sents):
+        """Returns one sentence per line as token_tag"""
+        result = []
+        for sent in tagged_sents:
+            for item in sent:
+                s = '_'.join(item)
+                result.append(s)
+
+        return result
+
+    def _download_model(self):
+        """Downloads the model from the server"""
+        temp_path = os.path.dirname(__file__) + '/data.zip'
+        url = "https://air.changwon.ac.kr/~airdemo/storage/espresso_data_1/data.zip"
+        print("Downloading Espresso5 model...")
+        with requests.get(url, stream=True) as r:
+            r.raise_for_status()
+            with open(temp_path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    # If you have a chunk-encoded response, uncomment the
+                    # 'if' below and set chunk_size to None.
+                    # if chunk:
+                    f.write(chunk)
+
+        if os.path.exists(self.data_dir):
+            os.rmdir(self.data_dir)  # note: removes the directory only if it is empty
+
+        with zipfile.ZipFile(temp_path, "r") as zip_ref:
+            zip_ref.extractall(os.path.dirname(__file__))
+
+    def _check_model(self):
+        """Checks whether the model data directory exists"""
+        if not os.path.exists(self.data_dir):
+            return False
+        else:
+            return True
+
+    def _dependency_after(self, list):
+        len_list = len(list)
+        temp_list = []
+        repeat = len_list // 3
+        for i in range(repeat):
+            index = i * 3
+            tup1 = (i + 1,)
+            tup2 = tuple(list[index:index + 3])
+            tup = tup1 + tup2
+            temp_list.append(tup[:])
+
+        return temp_list
+
+
+if __name__ == '__main__':
+    tagger = EspressoTagger(task='pos')
+    print(tagger.tag("나는 아름다운 강산에 살고있다."))
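A minimal usage sketch of the EspressoTagger API added above, following the file's own __main__ block. The import path mirrors the file location (nltkor/tag/espresso_tag.py), and the model data is fetched automatically on first use; the second call is a hypothetical variation:

    from nltkor.tag.espresso_tag import EspressoTagger

    # POS tagging; _check_model()/_download_model() fetch the model data on first use
    tagger = EspressoTagger(task='pos')
    print(tagger.tag("나는 아름다운 강산에 살고있다."))

    # task also accepts 'ner', 'wsd', 'srl' and 'dependency';
    # lemma=False switches the tagger to 'eumjeol' (syllable) mode
    print(tagger.tag("나는 아름다운 강산에 살고있다.", lemma=False))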
nltkor/tag/libs/__init__.py
@@ -0,0 +1,10 @@
+# import to provide easier access for nlpnet users
+from .config import set_data_dir
+from . import taggers
+from . import utils
+
+from .taggers import POSTagger, NERTagger, WSDTagger, SRLTagger, DependencyParser
+from .utils import tokenize
+from .utils import PickleConverter
+
+__version__ = '1.2.0'
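These re-exports also allow the lower-level API to be used directly. A short sketch, assuming a data directory already populated with trained models (the './data' path is illustrative):

    from nltkor.tag.libs import set_data_dir, POSTagger

    set_data_dir('./data')                 # register the model directory
    tagger = POSTagger(data_dir='./data')  # the same call espresso_tag.py makes via taggers.POSTagger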
Binary files (16 entries: the nltkor/tag/libs/__pycache__/*.pyc files listed above); no textual diff shown.
nltkor/tag/libs/arguments.py
@@ -0,0 +1,280 @@
+# -*- coding: utf-8 -*-
+
+"""
+Code for argument parsing and a few verifications.
+These arguments are used by the training script.
+"""
+
+import argparse
+
+
+def fill_defaults(args, defaults_per_task):
+    """
+    This function fills arguments not explicitly set (left as None)
+    with default values according to the chosen task.
+
+    We can't rely on argparse to do it because using subparsers with
+    set_defaults and a parent parser overwrites the defaults.
+    """
+    task = args.task
+    defaults = defaults_per_task[task]
+    for arg in args.__dict__:
+        if getattr(args, arg) is None and arg in defaults:
+            setattr(args, arg, defaults[arg])
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Train the Espresso "
+                                     "for a given NLP task.")
+    subparsers = parser.add_subparsers(title='Tasks',
+                                       dest='task',
+                                       description='Task to train the Espresso for. '
+                                       'Type %(prog)s [TASK] -h to get task-specific help.')
+
+    defaults = {}
+
+    # base parser with arguments not related to any model
+    base_parser = argparse.ArgumentParser(add_help=False)
+    base_parser.add_argument('-f', '--num_features', type=int,
+                             help='Number of features per word '
+                             '(used to generate random vectors)',
+                             default=50, dest='num_features')
+    base_parser.add_argument('--load_network', action='store_true',
+                             help='Load previously saved network')
+    base_parser.add_argument('--load_features', action='store_true',
+                             help="Load previously saved word type features "
+                             "(overrides -f and must also load a vocabulary file)",
+                             dest='load_types')
+    base_parser.add_argument('-v', '--verbose', help='Verbose mode',
+                             action="store_true")
+    base_parser.add_argument('--gold', help='File with annotated data for training.',
+                             type=str, required=True)
+    base_parser.add_argument('--data', help='Directory to save new models and load '
+                             'partially trained ones (default: ./data/)', type=str,
+                             default='./data/')
+    base_parser.add_argument('--dev', help='Development (validation) data. If not given, '
+                             'training data will be used to evaluate performance.',
+                             default=None)
+
+    # parser with network arguments shared among most tasks;
+    # each task-specific parser may define defaults
+    network_parser = argparse.ArgumentParser(add_help=False, parents=[base_parser])
+
+    network_parser.add_argument('-w', '--window', type=int,
+                                help='Size of the word window',
+                                dest='window')  # must be an odd number
+    network_parser.add_argument('-e', '--epochs', type=int, dest='iterations',
+                                help='Number of training epochs')
+    network_parser.add_argument('-l', '--learning_rate', type=float,
+                                help='Learning rate for network connections',
+                                dest='learning_rate')
+    network_parser.add_argument('--lf', type=float,
+                                help='Learning rate for features',
+                                dest='learning_rate_features')
+    network_parser.add_argument('--lt', type=float,
+                                help='Learning rate for tag transitions',
+                                dest='learning_rate_transitions')
+    network_parser.add_argument('--decay', type=float, const=1, nargs='?', default=None,
+                                help='Use learning rate decay. Optionally, '
+                                'supply decay factor (default 1)')
+    network_parser.add_argument('-a', '--accuracy', type=float,
+                                help='Maximum desired accuracy per token.',
+                                default=0, dest='accuracy')
+    network_parser.add_argument('-n', '--hidden', type=int,
+                                help='Number of hidden neurons',
+                                dest='hidden')
+    network_parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
+                                help='Include capitalization features. '
+                                'Optionally, supply the number of features (default 5)')
+
+    # parser with arguments shared among convolution-based tasks
+    conv_parser = argparse.ArgumentParser(add_help=False)
+    conv_parser.add_argument('-c', '--convolution', type=int,
+                             help='Number of convolution neurons',
+                             dest='convolution')
+    conv_parser.add_argument('--pos', const=5, nargs='?', type=int, default=None,
+                             help='Include part-of-speech features. '
+                             'Optionally, supply the number of features (default 5)')
+    conv_parser.add_argument('--max_dist', type=int, default=10,
+                             help='Maximum distance to have its own feature vector')
+    conv_parser.add_argument('--target_features', type=int, default=5,
+                             help='Number of features for distance to target word')
+    conv_parser.add_argument('--pred_features', type=int, default=5,
+                             help='Number of features for distance to predicate')
+
+    # POS argument parser
+    parser_pos = subparsers.add_parser('pos', help='POS tagging',
+                                       parents=[network_parser])
+    parser_pos.add_argument('--suffix', const=2, nargs='?', type=int, default=None,
+                            help='Include suffix features. Optionally, '
+                            'supply the number of features (default 2)')
+    parser_pos.add_argument('--suffix_size', type=int, default=5,
+                            help='Use suffixes up to this size (in characters, default 5). '
+                            'Only used if --suffix is supplied')
+    parser_pos.add_argument('--prefix', const=2, nargs='?', type=int, default=None,
+                            help='Include prefix features. Optionally, '
+                            'supply the number of features (default 2)')
+    parser_pos.add_argument('--prefix_size', type=int, default=5,
+                            help='Use prefixes up to this size (in characters, default 5). '
+                            'Only used if --prefix is supplied')
+    defaults['pos'] = dict(window=9, hidden=300, iterations=20,
+                           learning_rate=0.001, learning_rate_features=0.001,
+                           learning_rate_transitions=0.001)
+
+    # NER argument parser
+    parser_ner = subparsers.add_parser('ner', help='Named Entity Tagging',
+                                       parents=[network_parser])
+    parser_ner.add_argument('--suffix', const=2, nargs='?', type=int, default=None,
+                            help='Include suffix features. Optionally, '
+                            'supply the number of features (default 2)')
+    parser_ner.add_argument('--suffix_size', type=int, default=5,
+                            help='Use suffixes up to this size (in characters, default 5). '
+                            'Only used if --suffix is supplied')
+    parser_ner.add_argument('--prefix', const=2, nargs='?', type=int, default=None,
+                            help='Include prefix features. Optionally, '
+                            'supply the number of features (default 2)')
+    parser_ner.add_argument('--prefix_size', type=int, default=5,
+                            help='Use prefixes up to this size (in characters, default 5). '
+                            'Only used if --prefix is supplied')
+    defaults['ner'] = dict(window=5, hidden=300, iterations=20,
+                           learning_rate=0.001, learning_rate_features=0.001,
+                           learning_rate_transitions=0.001)
+
+    # WSD argument parser
+    parser_wsd = subparsers.add_parser('wsd', help='Word Sense Disambiguation',
+                                       parents=[network_parser])
+    parser_wsd.add_argument('--suffix', const=2, nargs='?', type=int, default=None,
+                            help='Include suffix features. Optionally, '
+                            'supply the number of features (default 2)')
+    parser_wsd.add_argument('--suffix_size', type=int, default=5,
+                            help='Use suffixes up to this size (in characters, default 5). '
+                            'Only used if --suffix is supplied')
+    parser_wsd.add_argument('--prefix', const=2, nargs='?', type=int, default=None,
+                            help='Include prefix features. Optionally, '
+                            'supply the number of features (default 2)')
+    parser_wsd.add_argument('--prefix_size', type=int, default=5,
+                            help='Use prefixes up to this size (in characters, default 5). '
+                            'Only used if --prefix is supplied')
+    defaults['wsd'] = dict(window=5, hidden=300, iterations=20,
+                           learning_rate=0.001, learning_rate_features=0.001,
+                           learning_rate_transitions=0.001)
+
+    # Chunk argument parser
+    # parser_ner = subparsers.add_parser('ner', help='Named Entity Tagging',
+    #                                    parents=[network_parser])
+    # parser_ner.add_argument('--suffix', const=2, nargs='?', type=int, default=None,
+    #                         help='Include suffix features. Optionally, '
+    #                         'supply the number of features (default 2)')
+    # parser_ner.add_argument('--suffix_size', type=int, default=5,
+    #                         help='Use suffixes up to this size (in characters, default 5). '
+    #                         'Only used if --suffix is supplied')
+    # parser_ner.add_argument('--prefix', const=2, nargs='?', type=int, default=None,
+    #                         help='Include prefix features. Optionally, '
+    #                         'supply the number of features (default 2)')
+    # parser_ner.add_argument('--prefix_size', type=int, default=5,
+    #                         help='Use prefixes up to this size (in characters, default 5). '
+    #                         'Only used if --prefix is supplied')
+    # defaults['ner'] = dict(window=7, hidden=300, iterations=20,
+    #                        learning_rate=0.001, learning_rate_features=0.001,
+    #                        learning_rate_transitions=0.001)
+
+    # dependency
+    parser_dep = subparsers.add_parser('dependency', help='Dependency parsing')
+    dep_subparsers = parser_dep.add_subparsers(title='Dependency parsing training steps',
+                                               dest='subtask',
+                                               description='Which step of the dependency training '
+                                               '(detecting edges or labeling them)')
+
+    dep_subparsers.add_parser('labeled', help='Labeling dependency edges',
+                              parents=[network_parser, conv_parser])
+    dep_subparsers.add_parser('unlabeled', help='Dependency edge detection',
+                              parents=[network_parser, conv_parser])
+
+    defaults['dependency_filter'] = dict()
+    defaults['labeled_dependency'] = dict(window=9, hidden=200, iterations=20,
+                                          learning_rate=0.001, learning_rate_features=0.001,
+                                          convolution=300, pos=50,
+                                          learning_rate_transitions=0.001)
+    defaults['unlabeled_dependency'] = dict(window=9, hidden=200, iterations=20,
+                                            learning_rate=0.001, learning_rate_features=0.001,
+                                            convolution=300, pos=50,
+                                            learning_rate_transitions=0.001)
+
+    # SRL argument parser
+    # There is another level of subparsers for predicate detection /
+    # argument boundary identification / argument classification /
+    # (id + class) in one step
+    parser_srl = subparsers.add_parser('srl', help='Semantic Role Labeling',
+                                       parents=[network_parser, conv_parser],
+                                       formatter_class=argparse.RawDescriptionHelpFormatter)
+    # parser_srl.set_defaults(identify=False, predicates=False, classify=False)
+
+    desc = '''SRL has 3 steps: predicate detection, argument identification and
+argument classification. Each one depends on the one before.
+
+You need one model trained for each subtask (or one for predicate
+detection and another with the other 2 steps) in order to perform
+full SRL.
+
+Type %(prog)s [SUBTASK] -h to get subtask-specific help.'''
+
+    '''
+    srl_subparsers = parser_srl.add_subparsers(title='SRL subtasks',
+                                               dest='subtask',
+                                               description=desc)
+    srl_subparsers.add_parser('pred', help='Predicate identification',
+                              parents=[network_parser])
+    defaults['srl_predicates'] = dict(window=5, hidden=50, iterations=1,
+                                      learning_rate=0.01, learning_rate_features=0.01,
+                                      learning_rate_transitions=0.01,
+                                      predicates=True)
+
+    srl_subparsers.add_parser('id', help='Argument identification',
+                              parents=[network_parser, conv_parser])
+    defaults['srl_boundary'] = dict(window=3, hidden=150, convolution=150,
+                                    identify=True, iterations=15,
+                                    learning_rate=0.001, learning_rate_features=0.001,
+                                    learning_rate_transitions=0.001)
+
+    srl_subparsers.add_parser('class', help='Argument classification',
+                              parents=[network_parser, conv_parser])
+    defaults['srl_classify'] = dict(window=3, hidden=0, convolution=100,
+                                    classify=True, iterations=3,
+                                    learning_rate=0.01, learning_rate_features=0.01,
+                                    learning_rate_transitions=0.01)
+    srl_subparsers.add_parser('1step', parents=[network_parser, conv_parser],
+                              help='Argument identification and '
+                              'classification together')
+    '''
+    defaults['srl'] = dict(window=3, hidden=150, convolution=200, iterations=8,
+                           pos=5,
+                           learning_rate=0.001, learning_rate_features=0.001,
+                           learning_rate_transitions=0.001)
+
+    args = parser.parse_args()
+
+    # if args.task == 'srl':
+    #     if args.subtask == 'class':
+    #         args.task = 'srl_classify'
+    #         args.classify = True
+    #     elif args.subtask == 'id':
+    #         args.task = 'srl_boundary'
+    #         args.identify = True
+    #     elif args.subtask == 'pred':
+    #         args.task = 'srl_predicates'
+    #         args.predicates = True
+    if args.task == 'dependency':
+        if args.subtask == 'labeled':
+            args.task = 'labeled_dependency'
+            args.labeled = True
+        elif args.subtask == 'unlabeled':
+            args.task = 'unlabeled_dependency'
+            args.labeled = False
+
+    fill_defaults(args, defaults)
+    return args
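To illustrate how get_args() and fill_defaults() interact: network options left unset on the command line stay None after parse_args() and are then filled from the per-task defaults. A hedged sketch; the train.py entry-point name and the gold file name are hypothetical:

    import sys
    from nltkor.tag.libs.arguments import get_args

    # simulate: python train.py pos --gold train.conll --data ./data/
    sys.argv = ['train.py', 'pos', '--gold', 'train.conll', '--data', './data/']
    args = get_args()

    # -w, -n and -e were not given, so fill_defaults() applied defaults['pos']
    print(args.window, args.hidden, args.iterations)  # -> 9 300 20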