arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
arekit/contrib/utils/data/readers/jsonl.py
DELETED
@@ -1,15 +0,0 @@
-from arekit.contrib.utils.data.readers.base import BaseReader
-from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
-
-
-class JsonlReader(BaseReader):
-
-    def extension(self):
-        return ".jsonl"
-
-    def read(self, target):
-        rows = []
-        with open(target, "r") as f:
-            for line in f.readlines():
-                rows.append(line)
-        return JsonlBasedRowsStorage(rows)
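A minimal usage sketch for the removed reader, assuming arekit 0.24.0 is installed; the input path is illustrative:

from arekit.contrib.utils.data.readers.jsonl import JsonlReader

reader = JsonlReader()
# Reads every raw line of the JSONL file into a JsonlBasedRowsStorage.
storage = reader.read(target="samples.jsonl")  # illustrative path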
arekit/contrib/utils/data/service/__init__.py
File without changes
arekit/contrib/utils/data/service/balance.py
DELETED
@@ -1,50 +0,0 @@
-import gc
-import importlib
-from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
-
-
-class PandasBasedStorageBalancing(object):
-
-    @staticmethod
-    def create_balanced_from(storage, column_name, free_origin=True):
-        """ Performs oversampled balancing.
-
-        Note: it is quite important to remove the previously created storage
-        in order to avoid memory leaks.
-
-        storage: PandasBasedRowsStorage
-            storage contents to be balanced.
-
-        column_name: str
-            column utilized for balancing.
-
-        free_origin: bool
-            indicates whether there is a need to release the resources
-            utilized for the original storage.
-        """
-        assert(isinstance(storage, PandasBasedRowsStorage))
-
-        original_df = storage.DataFrame
-
-        max_size = original_df[column_name].value_counts().max()
-
-        dframes = []
-        for class_index, group in original_df.groupby(column_name):
-            dframes.append(group.sample(max_size - len(group), replace=True))
-
-        # Clear resources.
-        pd = importlib.import_module("pandas")
-        balanced_df = pd.concat(dframes + [original_df])
-
-        # Removing temporarily created dataframes.
-        for df in dframes:
-            del df
-
-        # Marking the original dataframe as released
-        # in terms of the allocated memory for it.
-        if free_origin:
-            storage.free()
-
-        gc.collect()
-
-        return PandasBasedRowsStorage(df=balanced_df)
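The removed balancing is plain pandas oversampling; a standalone sketch of the same logic, independent of the removed class (the function name and the "label" column are ours):

import pandas as pd

def oversample_by_column(df, column):
    # For every class, draw (with replacement) enough extra rows to match
    # the largest class, then append the extras to the original frame.
    max_size = df[column].value_counts().max()
    extras = [group.sample(max_size - len(group), replace=True)
              for _, group in df.groupby(column)]
    return pd.concat(extras + [df])

df = pd.DataFrame({"label": [0, 0, 0, 1], "text": ["a", "b", "c", "d"]})
balanced = oversample_by_column(df, "label")  # both classes end up with 3 rows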
arekit/contrib/utils/data/writers/csv_native.py
DELETED
@@ -1,63 +0,0 @@
-import csv
-import os
-from os.path import dirname
-
-from arekit.common.data.storages.base import BaseRowsStorage
-from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
-from arekit.contrib.utils.data.writers.base import BaseWriter
-
-
-class NativeCsvWriter(BaseWriter):
-
-    def __init__(self, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL, header=True):
-        self.__target_f = None
-        self.__writer = None
-        self.__create_writer_func = lambda f: csv.writer(
-            f, delimiter=delimiter, quotechar=quotechar, quoting=quoting)
-        self.__header = header
-        self.__header_written = None
-
-    def extension(self):
-        return ".csv"
-
-    @staticmethod
-    def __iter_storage_column_names(storage):
-        """ Iterates only those columns that exist in the storage.
-        """
-        for col_name in storage.iter_column_names():
-            if col_name in storage.RowCache:
-                yield col_name
-
-    def open_target(self, target):
-        os.makedirs(dirname(target), exist_ok=True)
-        self.__target_f = open(target, "w")
-        self.__writer = self.__create_writer_func(self.__target_f)
-        self.__header_written = not self.__header
-
-    def close_target(self):
-        self.__target_f.close()
-
-    def commit_line(self, storage):
-        assert(isinstance(storage, RowCacheStorage))
-        assert(self.__writer is not None)
-
-        if not self.__header_written:
-            self.__writer.writerow(list(self.__iter_storage_column_names(storage)))
-            self.__header_written = True
-
-        line_data = list(map(lambda col_name: storage.RowCache[col_name],
-                             self.__iter_storage_column_names(storage)))
-        self.__writer.writerow(line_data)
-
-    def write_all(self, storage, target):
-        """ Writes all the `storage` rows
-        into the `target` filepath, formatted as CSV.
-        """
-        assert(isinstance(storage, BaseRowsStorage))
-
-        with open(target, "w") as f:
-            writer = self.__create_writer_func(f)
-            for _, row in storage:
-                # content = [row[col_name] for col_name in storage.iter_column_names()]
-                content = [v for v in row]
-                writer.writerow(content)
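A usage sketch for the removed writer under arekit 0.24.0; `samples_storage` stands for any BaseRowsStorage produced during serialization:

from arekit.contrib.utils.data.writers.csv_native import NativeCsvWriter

# The default delimiter is '\t'; comma is an illustrative override.
writer = NativeCsvWriter(delimiter=",")
writer.write_all(storage=samples_storage, target="out/samples.csv")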
arekit/contrib/utils/data/writers/csv_pd.py
DELETED
@@ -1,40 +0,0 @@
-import logging
-
-from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
-from arekit.common.utils import create_dir_if_not_exists
-from arekit.contrib.utils.data.storages.pandas_based import PandasBasedRowsStorage
-from arekit.contrib.utils.data.writers.base import BaseWriter
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-
-class PandasCsvWriter(BaseWriter):
-
-    def __init__(self, write_header):
-        super(PandasCsvWriter, self).__init__()
-        self.__write_header = write_header
-
-    def extension(self):
-        return ".tsv.gz"
-
-    def write_all(self, storage, target):
-        assert(isinstance(storage, PandasBasedRowsStorage))
-        assert(isinstance(target, str))
-
-        create_dir_if_not_exists(target)
-
-        # Temporary hack, remove it in future.
-        df = storage.DataFrame
-
-        logger.info("Saving... {length}: {filepath}".format(length=len(storage), filepath=target))
-        df.to_csv(target,
-                  sep='\t',
-                  encoding='utf-8',
-                  columns=[c for c in df.columns if c != BaseColumnsProvider.ROW_ID],
-                  index=False,
-                  float_format="%.0f",
-                  compression='gzip',
-                  header=self.__write_header)
-
-        logger.info("Saving completed!")
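An analogous sketch for the removed pandas-backed writer (arekit 0.24.0); `samples_storage` again stands for a PandasBasedRowsStorage:

from arekit.contrib.utils.data.writers.csv_pd import PandasCsvWriter

# Dumps the backing DataFrame as a gzip-compressed TSV, hence ".tsv.gz".
writer = PandasCsvWriter(write_header=True)
writer.write_all(storage=samples_storage, target="out/samples.tsv.gz")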
arekit/contrib/utils/data/writers/json_opennre.py
DELETED
@@ -1,132 +0,0 @@
-import json
-import logging
-import os
-from os.path import dirname
-
-from arekit.common.data import const
-from arekit.common.data.storages.base import BaseRowsStorage
-from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
-from arekit.contrib.utils.data.writers.base import BaseWriter
-
-logger = logging.getLogger(__name__)
-
-
-class OpenNREJsonWriter(BaseWriter):
-    """ This is a bag-based writer for the samples.
-        Project page: https://github.com/thunlp/OpenNRE
-
-        Every bag is presented as follows:
-        {
-            'text' or 'token': ...,
-            'h': {'pos': [start, end], 'id': ... },
-            't': {'pos': [start, end], 'id': ... },
-            'id': "id_of_the_text_opinion"
-        }
-
-        In terms of the linked opinions (i0, i1, etc.) we consider the id of the first opinion in the linkage.
-        During the dataset reading stage via OpenNRE, these linkages are automatically grouped into bags.
-    """
-
-    def __init__(self, text_columns, encoding="utf-8", na_value="NA", keep_extra_columns=True,
-                 skip_extra_existed=True):
-        """ text_columns: list
-                column names that are expected to be joined into a single (token) column.
-        """
-        assert(isinstance(text_columns, list))
-        assert(isinstance(encoding, str))
-        self.__text_columns = text_columns
-        self.__encoding = encoding
-        self.__target_f = None
-        self.__keep_extra_columns = keep_extra_columns
-        self.__na_value = na_value
-        self.__skip_extra_existed = skip_extra_existed
-
-    def extension(self):
-        return ".jsonl"
-
-    @staticmethod
-    def __format_row(row, na_value, text_columns, keep_extra_columns, skip_extra_existed):
-        """ Formatting that is compatible with the OpenNRE.
-        """
-        assert(isinstance(na_value, str))
-
-        sample_id = row[const.ID]
-        s_ind = int(row[const.S_IND])
-        t_ind = int(row[const.T_IND])
-        bag_id = str(row[const.OPINION_ID])
-
-        # Gather tokens.
-        tokens = []
-        for text_col in text_columns:
-            if text_col in row:
-                tokens.extend(row[text_col].split())
-
-        # Filtering JSON row.
-        formatted_data = {
-            "id": bag_id,
-            "id_orig": sample_id,
-            "token": tokens,
-            "h": {"pos": [s_ind, s_ind + 1], "id": str(bag_id + "s")},
-            "t": {"pos": [t_ind, t_ind + 1], "id": str(bag_id + "t")},
-            "relation": str(int(row[const.LABEL_UINT])) if const.LABEL_UINT in row else na_value
-        }
-
-        # Register extra fields (optionally).
-        if keep_extra_columns:
-            for key, value in row.items():
-                if key not in formatted_data and key not in text_columns:
-                    formatted_data[key] = value
-                else:
-                    if not skip_extra_existed:
-                        raise Exception(f"key `{key}` is already exist in formatted data "
-                                        f"or a part of the text columns list: {text_columns}")
-
-        return formatted_data
-
-    def open_target(self, target):
-        os.makedirs(dirname(target), exist_ok=True)
-        self.__target_f = open(target, "w")
-        pass
-
-    def close_target(self):
-        self.__target_f.close()
-
-    def commit_line(self, storage):
-        assert(isinstance(storage, RowCacheStorage))
-
-        # Collect the columns that exist.
-        row_data = {}
-        for col_name in storage.iter_column_names():
-            if col_name not in storage.RowCache:
-                continue
-            row_data[col_name] = storage.RowCache[col_name]
-
-        bag = self.__format_row(row_data, text_columns=self.__text_columns,
-                                keep_extra_columns=self.__keep_extra_columns,
-                                na_value=self.__na_value,
-                                skip_extra_existed=self.__skip_extra_existed)
-
-        self.__write_bag(bag=bag, json_file=self.__target_f)
-
-    @staticmethod
-    def __write_bag(bag, json_file):
-        assert(isinstance(bag, dict))
-        json.dump(bag, json_file, separators=(",", ":"), ensure_ascii=False)
-        json_file.write("\n")
-
-    def write_all(self, storage, target):
-        assert(isinstance(storage, BaseRowsStorage))
-        assert(isinstance(target, str))
-
-        logger.info("Saving... {rows}: {filepath}".format(rows=(len(storage)), filepath=target))
-
-        os.makedirs(os.path.dirname(target), exist_ok=True)
-        with open(target, "w", encoding=self.__encoding) as json_file:
-            for row_index, row in storage:
-                self.__write_bag(bag=self.__format_row(row, text_columns=self.__text_columns,
-                                                       keep_extra_columns=self.__keep_extra_columns,
-                                                       na_value=self.__na_value,
-                                                       skip_extra_existed=self.__skip_extra_existed),
-                                 json_file=json_file)
-
-        logger.info("Saving completed!")
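Independent of AREkit, the emitted line format can be reproduced with the standard library alone; all field values below are illustrative:

import json

bag = {
    "id": "o0",                         # id of the first opinion in the linkage
    "token": ["Acme", "praised", "Globex"],
    "h": {"pos": [0, 1], "id": "o0s"},  # subject span as [start, end)
    "t": {"pos": [2, 3], "id": "o0t"},  # object span as [start, end)
    "relation": "1",                    # or the configured na_value when unlabeled
}
with open("bags.jsonl", "w", encoding="utf-8") as f:
    json.dump(bag, f, separators=(",", ":"), ensure_ascii=False)
    f.write("\n")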
arekit/contrib/utils/data/writers/sqlite_native.py
DELETED
@@ -1,110 +0,0 @@
-import os
-import sqlite3
-from os.path import dirname
-
-from arekit.common.data import const
-from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
-from arekit.contrib.utils.data.writers.base import BaseWriter
-
-
-class SQliteWriter(BaseWriter):
-
-    def __init__(self, table_name="contents", index_column_names=None, skip_existed=False, clear_table=True):
-        """ index_column_names: list or None
-                column names that should be considered to build a unique index;
-                if None, the default 'const.ID' will be considered for row indexation.
-        """
-        assert(isinstance(index_column_names, list) or index_column_names is None)
-        self.__index_column_names = index_column_names if index_column_names is not None else [const.ID]
-        self.__table_name = table_name
-        self.__conn = None
-        self.__cur = None
-        self.__need_init_table = True
-        self.__origin_column_names = None
-        self.__skip_existed = skip_existed
-        self.__clear_table = clear_table
-
-    def extension(self):
-        return ".sqlite"
-
-    @staticmethod
-    def __iter_storage_column_names(storage):
-        """ Iterates only those columns that exist in the storage.
-        """
-        assert(isinstance(storage, RowCacheStorage))
-        for col_name, col_type in zip(storage.iter_column_names(), storage.iter_column_types()):
-            if col_name in storage.RowCache:
-                yield col_name, col_type
-
-    def __init_table(self, column_data):
-        # Compose column name with the related SQLITE type.
-        column_types = ",".join([" ".join([col_name, self.type_to_sqlite(col_type)])
-                                 for col_name, col_type in column_data])
-        # Create table if not exists.
-        self.__cur.execute(f"CREATE TABLE IF NOT EXISTS {self.__table_name}({column_types})")
-        # Table exists, however we may optionally remove the content from it.
-        if self.__clear_table:
-            self.__cur.execute(f"DELETE FROM {self.__table_name};")
-        # Create index.
-        index_name = f"i_{self.__table_name}_id"
-        self.__cur.execute(f"DROP INDEX IF EXISTS {index_name};")
-        self.__cur.execute("CREATE INDEX IF NOT EXISTS {index} ON {table}({columns})".format(
-            index=index_name,
-            table=self.__table_name,
-            columns=", ".join(self.__index_column_names)
-        ))
-        self.__origin_column_names = [col_name for col_name, _ in column_data]
-
-    @staticmethod
-    def type_to_sqlite(col_type):
-        """ This is a simple function that provides conversion from the
-            base numpy types to SQLITE.
-            NOTE: this method represents a quick implementation for supporting
-            types; however, it is far away from a generalized implementation.
-        """
-        if isinstance(col_type, str):
-            if 'int' in col_type:
-                return 'INTEGER'
-
-        return "TEXT"
-
-    def open_target(self, target):
-        os.makedirs(dirname(target), exist_ok=True)
-        self.__conn = sqlite3.connect(target)
-        self.__cur = self.__conn.cursor()
-
-    def commit_line(self, storage):
-        assert(isinstance(storage, RowCacheStorage))
-
-        column_data = list(self.__iter_storage_column_names(storage))
-
-        if self.__need_init_table:
-            self.__init_table(column_data)
-            self.__need_init_table = False
-
-        # Check whether the related row already exists in the SQLITE database.
-        row_id = storage.RowCache[const.ID]
-        top_row = self.__cur.execute(f"SELECT EXISTS(SELECT 1 FROM {self.__table_name} WHERE id='{row_id}');")
-        is_exists = top_row.fetchone()[0]
-        if is_exists == 1 and self.__skip_existed:
-            return
-
-        line_data = [storage.RowCache[col_name] for col_name, _ in column_data]
-        parameters = ",".join(["?"] * len(line_data))
-
-        assert(len(self.__origin_column_names) == len(line_data))
-
-        self.__cur.execute(
-            f"INSERT OR REPLACE INTO {self.__table_name} VALUES ({parameters})",
-            tuple(line_data))
-
-        self.__conn.commit()
-
-    def close_target(self):
-        self.__cur = None
-        self.__origin_column_names = None
-        self.__need_init_table = True
-        self.__conn.close()
-
-    def write_all(self, storage, target):
-        pass
arekit/contrib/utils/download.py
DELETED
@@ -1,77 +0,0 @@
-import os
-import tarfile
-from os.path import join, exists
-
-from arekit.common import utils
-from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
-from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
-
-NEWS_MYSTEM_SKIPGRAM_1000_20_2015 = "news_mystem_skipgram_1000_20_2015.tar.gz"
-
-
-def __get_resource(local_name, check_existance=False, download_if_missed=False):
-    assert(isinstance(local_name, str))
-    filepath = join(utils.get_default_download_dir(), local_name)
-
-    if check_existance and not exists(filepath):
-        if download_if_missed:
-            download()
-            # We try to get the resource again but won't attempt to download it again.
-            __get_resource(local_name, check_existance=check_existance, download_if_missed=False)
-        else:
-            raise Exception("Resource could not be found: {}".format(filepath))
-
-    return filepath
-
-
-def __get_embedding_dir(filepath):
-    return filepath.replace(".tar.gz", "")
-
-
-def load_embedding_and_vocab(local_name, check_existance=False, download_if_missed=False):
-    tar_gz_archive = __get_resource(local_name, check_existance=check_existance,
-                                    download_if_missed=download_if_missed)
-    target_dir = __get_embedding_dir(tar_gz_archive)
-    embedding = NpzEmbeddingHelper.load_embedding(os.path.join(target_dir, "embedding.npz"))
-    vocab = VocabRepositoryUtils.load(os.path.join(target_dir, "vocab.txt"))
-    return embedding, vocab
-
-
-def download():
-    data = {
-        NEWS_MYSTEM_SKIPGRAM_1000_20_2015: "https://www.dropbox.com/s/0omnlgzgnjhxlmf/{filename}?dl=1".format(
-            filename=NEWS_MYSTEM_SKIPGRAM_1000_20_2015),
-    }
-
-    # Perform downloading ...
-    for local_name, url_link in data.items():
-        utils.download(dest_file_path=__get_resource(local_name),
-                       source_url=url_link)
-
-    # Untar files ...
-    for local_name in data.keys():
-
-        if ".tar.gz" not in local_name:
-            continue
-
-        target_filepath = __get_resource(local_name)
-        with tarfile.open(target_filepath) as file:
-            def is_within_directory(directory, target):
-
-                abs_directory = os.path.abspath(directory)
-                abs_target = os.path.abspath(target)
-
-                prefix = os.path.commonprefix([abs_directory, abs_target])
-
-                return prefix == abs_directory
-
-            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
-
-                for member in tar.getmembers():
-                    member_path = os.path.join(path, member.name)
-                    if not is_within_directory(path, member_path):
-                        raise Exception("Attempted Path Traversal in Tar File")
-
-                tar.extractall(path, members, numeric_owner=numeric_owner)
-
-            safe_extract(file, __get_embedding_dir(target_filepath))
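A usage sketch for the removed loader (arekit 0.24.0); the `check_existance` spelling comes from the source above:

from arekit.contrib.utils.download import (
    NEWS_MYSTEM_SKIPGRAM_1000_20_2015, load_embedding_and_vocab)

# Downloads the archive on first use, then loads "embedding.npz" and
# "vocab.txt" from the unpacked directory.
embedding, vocab = load_embedding_and_vocab(
    local_name=NEWS_MYSTEM_SKIPGRAM_1000_20_2015,
    check_existance=True,
    download_if_missed=True)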
arekit/contrib/utils/embeddings/__init__.py
File without changes
arekit/contrib/utils/embeddings/rusvectores.py
DELETED
@@ -1,58 +0,0 @@
-from arekit.common.text.stemmer import Stemmer
-from arekit.contrib.networks.embedding import Embedding
-
-
-class RusvectoresEmbedding(Embedding):
-    """ Wrapper over models from the following resource:
-        https://rusvectores.org/ru/models/
-
-        NOTE: Usually these are embeddings for texts written in Russian;
-        for better performance it is expected that we adopt a stemmer.
-    """
-
-    def __init__(self, matrix, words, stemmer):
-        assert(isinstance(stemmer, Stemmer) or stemmer is None)
-        super(RusvectoresEmbedding, self).__init__(matrix=matrix, words=words)
-        self.__index_without_pos = self.__create_terms_without_pos()
-        self.__stemmer = stemmer
-        self.__lemmatize_by_default = stemmer is not None
-
-    def try_find_index_by_plain_word(self, word):
-        assert(isinstance(word, str))
-
-        temp = self.__lemmatize_by_default
-        self.__lemmatize_by_default = False
-        index = super(RusvectoresEmbedding, self).try_find_index_by_plain_word(word)
-        self.__lemmatize_by_default = temp
-
-        return index
-
-    def _handler(self, word):
-        return self.__try_find_word_index_pair_lemmatized(word, self.__lemmatize_by_default)
-
-    # region private methods
-
-    def __try_find_word_index_pair_lemmatized(self, term, lemmatize):
-        assert(isinstance(term, str))
-        assert(isinstance(lemmatize, bool))
-
-        if lemmatize:
-            term = self.__stemmer.lemmatize_to_str(term)
-
-        index = self.__index_without_pos[term] \
-            if term in self.__index_without_pos else None
-
-        return term, index
-
-    def __create_terms_without_pos(self):
-        d = {}
-        for word_with_pos, index in self.iter_vocabulary():
-            assert(isinstance(word_with_pos, str))
-            word = word_with_pos.split(u'_')[0]
-            if word in d:
-                continue
-            d[word] = index
-
-        return d
-
-    # endregion
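A minimal construction sketch (arekit 0.24.0); the two-word vocabulary is illustrative, and the matrix/words constructor shape follows the TokenEmbedding source below:

import numpy as np
from arekit.contrib.utils.embeddings.rusvectores import RusvectoresEmbedding

# RusVectores vocabularies carry POS suffixes ("дом_NOUN"); the wrapper also
# indexes the bare surface forms. Without a stemmer, no lemmatization occurs.
emb = RusvectoresEmbedding(matrix=np.random.rand(2, 5),
                           words=["дом_NOUN", "идти_VERB"],
                           stemmer=None)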
arekit/contrib/utils/embeddings/tokens.py
DELETED
@@ -1,30 +0,0 @@
-import numpy as np
-
-from arekit.contrib.networks.embedding import Embedding
-from arekit.contrib.utils.processing.text.tokens import Tokens
-
-
-class TokenEmbedding(Embedding):
-    """ Embedding vectors for text punctuation, based on Tokens in parsed text
-    """
-
-    @classmethod
-    def from_supported_tokens(cls, vector_size, random_vector_func):
-        """
-        random_vector_func: func
-            function with parameters (vector_size, seed)
-        """
-        assert(isinstance(vector_size, int))
-        assert(callable(random_vector_func))
-
-        matrix = []
-        tokens_list = list(Tokens.iter_supported_tokens())
-
-        for token_index, _ in enumerate(tokens_list):
-
-            vector = random_vector_func(vector_size, token_index)
-
-            matrix.append(vector)
-
-        return cls(matrix=np.array(matrix),
-                   words=tokens_list)
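Usage sketch (arekit 0.24.0); the uniform generator below is an illustrative choice of `random_vector_func`:

import numpy as np
from arekit.contrib.utils.embeddings.tokens import TokenEmbedding

def random_vector(vector_size, seed):
    # Deterministic per-token vector; the token index arrives as the seed.
    return np.random.RandomState(seed).uniform(-1.0, 1.0, size=vector_size)

token_emb = TokenEmbedding.from_supported_tokens(vector_size=50,
                                                 random_vector_func=random_vector)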
arekit/contrib/utils/io_utils/embedding.py
DELETED
@@ -1,72 +0,0 @@
-from os.path import join
-
-from arekit.contrib.networks.embedding_io import BaseEmbeddingIO
-from arekit.contrib.utils.io_utils.utils import check_targets_existence
-from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
-from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
-
-
-class NpEmbeddingIO(BaseEmbeddingIO):
-    """ Npz-based IO utils for embedding and text-based for vocabulary.
-        This format represents an archived version of the numpy math data, i.e. vectors, numbers, etc.
-
-        Provides additional Input/Output paths generation functions for:
-        - embedding matrix;
-        - embedding vocabulary.
-    """
-
-    def __init__(self, target_dir, prefix_name="sample"):
-        assert(isinstance(target_dir, str))
-
-        self.__target_dir = target_dir
-        self.__term_emb_fn_template = "-".join([prefix_name, "term_embedding"])
-        self.__vocab_fn_template = "-".join([prefix_name, "term_embedding"])
-
-    # region Embedding-related data
-
-    def save_vocab(self, data):
-        target = self.__get_default_vocab_filepath()
-        return VocabRepositoryUtils.save(data=data, target=target)
-
-    def load_vocab(self):
-        source = self.___get_vocab_source()
-        return dict(VocabRepositoryUtils.load(source))
-
-    def save_embedding(self, data):
-        target = self.__get_default_embedding_filepath()
-        NpzEmbeddingHelper.save_embedding(data=data, target=target)
-
-    def load_embedding(self):
-        source = self.__get_term_embedding_source()
-        return NpzEmbeddingHelper.load_embedding(source)
-
-    def check_targets_existed(self):
-        targets = [
-            self.__get_default_vocab_filepath(),
-            self.__get_term_embedding_target()
-        ]
-        return check_targets_existence(targets=targets)
-
-    # endregion
-
-    # region embedding-related data
-
-    def ___get_vocab_source(self):
-        """ It is possible to load a predefined embedding from another experiment
-            using the related filepath provided by model_io.
-        """
-        return self.__get_default_vocab_filepath()
-
-    def __get_term_embedding_target(self):
-        return self.__get_default_embedding_filepath()
-
-    def __get_term_embedding_source(self):
-        return self.__get_default_embedding_filepath()
-
-    def __get_default_vocab_filepath(self):
-        return join(self.__target_dir, self.__vocab_fn_template)
-
-    def __get_default_embedding_filepath(self):
-        return join(self.__target_dir, self.__term_emb_fn_template)
-
-    # endregion