arekit 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. arekit/common/docs/entities_grouping.py +2 -1
  2. arekit/common/docs/parser.py +52 -20
  3. arekit/common/pipeline/base.py +12 -16
  4. arekit/common/pipeline/batching.py +28 -0
  5. arekit/common/pipeline/context.py +5 -1
  6. arekit/common/pipeline/items/base.py +38 -1
  7. arekit/common/pipeline/items/flatten.py +5 -1
  8. arekit/common/pipeline/items/handle.py +2 -1
  9. arekit/common/pipeline/items/iter.py +2 -1
  10. arekit/common/pipeline/items/map.py +2 -1
  11. arekit/common/pipeline/items/map_nested.py +4 -0
  12. arekit/common/pipeline/utils.py +32 -0
  13. arekit/common/service/sqlite.py +36 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +0 -44
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  18. arekit/contrib/utils/data/storages/row_cache.py +6 -1
  19. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  20. arekit/contrib/utils/data/writers/sqlite_native.py +4 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/sampling/base.py +7 -12
  23. arekit/contrib/utils/pipelines/items/sampling/networks.py +3 -2
  24. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  25. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  26. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +2 -2
  27. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  28. arekit/contrib/utils/pipelines/items/text/tokenizer.py +2 -4
  29. arekit/contrib/utils/pipelines/items/text/translator.py +2 -1
  30. arekit/contrib/utils/pipelines/text_opinion/extraction.py +6 -9
  31. arekit/contrib/utils/serializer.py +1 -2
  32. arekit-0.25.0.data/data/logo.png +0 -0
  33. arekit-0.25.0.dist-info/METADATA +82 -0
  34. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/RECORD +38 -153
  35. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  36. arekit/common/docs/objects_parser.py +0 -37
  37. arekit/common/text/parser.py +0 -12
  38. arekit/common/text/partitioning/base.py +0 -4
  39. arekit/common/text/partitioning/terms.py +0 -35
  40. arekit/contrib/source/__init__.py +0 -0
  41. arekit/contrib/source/brat/__init__.py +0 -0
  42. arekit/contrib/source/brat/annot.py +0 -84
  43. arekit/contrib/source/brat/doc.py +0 -28
  44. arekit/contrib/source/brat/entities/__init__.py +0 -0
  45. arekit/contrib/source/brat/entities/compound.py +0 -13
  46. arekit/contrib/source/brat/entities/entity.py +0 -42
  47. arekit/contrib/source/brat/entities/parser.py +0 -53
  48. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  49. arekit/contrib/source/brat/opinions/converter.py +0 -19
  50. arekit/contrib/source/brat/relation.py +0 -32
  51. arekit/contrib/source/brat/sentence.py +0 -69
  52. arekit/contrib/source/brat/sentences_reader.py +0 -128
  53. arekit/contrib/source/download.py +0 -41
  54. arekit/contrib/source/nerel/__init__.py +0 -0
  55. arekit/contrib/source/nerel/entities.py +0 -55
  56. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  57. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  58. arekit/contrib/source/nerel/io_utils.py +0 -62
  59. arekit/contrib/source/nerel/labels.py +0 -241
  60. arekit/contrib/source/nerel/reader.py +0 -46
  61. arekit/contrib/source/nerel/utils.py +0 -24
  62. arekit/contrib/source/nerel/versions.py +0 -12
  63. arekit/contrib/source/nerelbio/__init__.py +0 -0
  64. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  65. arekit/contrib/source/nerelbio/labels.py +0 -265
  66. arekit/contrib/source/nerelbio/reader.py +0 -8
  67. arekit/contrib/source/nerelbio/versions.py +0 -8
  68. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  69. arekit/contrib/source/ruattitudes/collection.py +0 -36
  70. arekit/contrib/source/ruattitudes/doc.py +0 -51
  71. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  72. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  73. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  74. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  75. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  76. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  77. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  78. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  79. arekit/contrib/source/ruattitudes/reader.py +0 -268
  80. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  81. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  82. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  83. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  84. arekit/contrib/source/rusentiframes/collection.py +0 -157
  85. arekit/contrib/source/rusentiframes/effect.py +0 -24
  86. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  87. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  88. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  89. arekit/contrib/source/rusentiframes/role.py +0 -15
  90. arekit/contrib/source/rusentiframes/state.py +0 -24
  91. arekit/contrib/source/rusentiframes/types.py +0 -42
  92. arekit/contrib/source/rusentiframes/value.py +0 -2
  93. arekit/contrib/source/rusentrel/__init__.py +0 -0
  94. arekit/contrib/source/rusentrel/const.py +0 -3
  95. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  96. arekit/contrib/source/rusentrel/entities.py +0 -26
  97. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  98. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  99. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  100. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  101. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  102. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  103. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  104. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  105. arekit/contrib/source/sentinerel/__init__.py +0 -0
  106. arekit/contrib/source/sentinerel/entities.py +0 -52
  107. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  108. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  109. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  110. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  111. arekit/contrib/source/sentinerel/labels.py +0 -53
  112. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  113. arekit/contrib/source/sentinerel/reader.py +0 -42
  114. arekit/contrib/source/synonyms/__init__.py +0 -0
  115. arekit/contrib/source/synonyms/utils.py +0 -19
  116. arekit/contrib/source/zip_utils.py +0 -47
  117. arekit/contrib/utils/connotations/__init__.py +0 -0
  118. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  119. arekit/contrib/utils/download.py +0 -77
  120. arekit/contrib/utils/io_utils/opinions.py +0 -37
  121. arekit/contrib/utils/io_utils/samples.py +0 -79
  122. arekit/contrib/utils/lexicons/__init__.py +0 -0
  123. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  124. arekit/contrib/utils/lexicons/relation.py +0 -42
  125. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  126. arekit/contrib/utils/nn/__init__.py +0 -0
  127. arekit/contrib/utils/nn/rows.py +0 -83
  128. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  129. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  130. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  131. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  132. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  133. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  134. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  135. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  136. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  137. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  138. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  139. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  140. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  141. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  142. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  143. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  144. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  145. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  146. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  147. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  148. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  149. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  150. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  151. arekit/contrib/utils/resources.py +0 -25
  152. arekit/contrib/utils/sources/__init__.py +0 -0
  153. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  154. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  155. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  156. arekit/download_data.py +0 -11
  157. arekit-0.24.0.dist-info/METADATA +0 -23
  158. /arekit/common/{text/partitioning → service}/__init__.py +0 -0
  159. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  160. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,87 +0,0 @@
1
- from enum import Enum
2
- from os import path
3
- from os.path import basename, join
4
-
5
- import enum
6
-
7
- from arekit.contrib.source.sentinerel.folding.factory import SentiNERELFoldingFactory
8
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
9
-
10
-
11
- class SentiNerelVersions(Enum):
12
- """ List of the supported version of this collection
13
- """
14
-
15
- # Initial version.
16
- V1 = "v1_0"
17
- # Updated annotation within the second half of the texts. (September 2022)
18
- V2 = "v2_0"
19
- # Updated annotation within the first half of the texts. (October 2022)
20
- # Become a source of the RuSentNE-2023 competition.
21
- # https://github.com/dialogue-evaluation/RuSentNE-evaluation
22
- V21 = "v2_1"
23
-
24
-
25
- DEFAULT_VERSION = SentiNerelVersions.V21
26
-
27
-
28
- class SentiNerelIOUtils(ZipArchiveUtils):
29
-
30
- inner_root = "sentiment_dataset"
31
-
32
- @staticmethod
33
- def get_archive_filepath(version):
34
- return path.join(SentiNerelIOUtils.get_data_root(), "sentinerel-{}.zip".format(version))
35
-
36
- @staticmethod
37
- def get_annotation_innerpath(filename):
38
- assert(isinstance(filename, str))
39
- return path.join(SentiNerelIOUtils.inner_root, "{}.ann".format(filename))
40
-
41
- @staticmethod
42
- def get_doc_innerpath(filename):
43
- assert(isinstance(filename, str))
44
- return path.join(SentiNerelIOUtils.inner_root, "{}.txt".format(filename))
45
-
46
- @staticmethod
47
- def __iter_filenames_from_dataset(folder_name, version):
48
- assert(isinstance(version, enum.Enum))
49
- assert(isinstance(folder_name, str))
50
-
51
- for filename in SentiNerelIOUtils.iter_filenames_from_zip(version):
52
-
53
- extension = filename[-4:]
54
-
55
- # Crop extension.
56
- filename = filename[:-4]
57
-
58
- if extension != ".txt":
59
- continue
60
-
61
- if not folder_name in filename:
62
- continue
63
-
64
- yield basename(filename)
65
-
66
- # region public methods
67
-
68
- @staticmethod
69
- def iter_collection_filenames(version=DEFAULT_VERSION):
70
- filenames_it = SentiNerelIOUtils.__iter_filenames_from_dataset(
71
- folder_name=SentiNerelIOUtils.inner_root, version=version)
72
-
73
- for doc_id, filename in enumerate(filenames_it):
74
- yield doc_id, filename
75
-
76
- @staticmethod
77
- def read_dataset_split(version=DEFAULT_VERSION, docs_limit=None):
78
- """ Provides a fixed split of the dataset onto
79
- `test` and `training` part:
80
- https://github.com/nicolay-r/SentiNEREL-attitude-extraction
81
- """
82
- return SentiNerelIOUtils.read_from_zip(
83
- inner_path=join(SentiNerelIOUtils.inner_root, "split_fixed.txt"),
84
- process_func=lambda f: SentiNERELFoldingFactory.create_fixed_folding(file=f, limit=docs_limit),
85
- version=version)
86
-
87
- # endregion
@@ -1,53 +0,0 @@
1
- from arekit.common.labels.base import Label
2
-
3
-
4
- class OpinionBelongsTo(Label):
5
- pass
6
-
7
-
8
- class OpinionRelatesTo(Label):
9
- pass
10
-
11
-
12
- class NegEffectFrom(Label):
13
- pass
14
-
15
-
16
- class NegStateFrom(Label):
17
- pass
18
-
19
-
20
- class PosEffectFrom(Label):
21
- pass
22
-
23
-
24
- class PosAuthorFrom(Label):
25
- pass
26
-
27
-
28
- class NegAuthorFrom(Label):
29
- pass
30
-
31
-
32
- class PosStateFrom(Label):
33
- pass
34
-
35
-
36
- class NegativeTo(Label):
37
- pass
38
-
39
-
40
- class PositiveTo(Label):
41
- pass
42
-
43
-
44
- class AlternativeName(Label):
45
- pass
46
-
47
-
48
- class StateBelongsTo(Label):
49
- pass
50
-
51
-
52
- class OriginsFrom(Label):
53
- pass
@@ -1,30 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- from arekit.common.labels.scaler.base import BaseLabelScaler
4
- from arekit.contrib.source.sentinerel import labels
5
-
6
-
7
- class SentiNerelLabelScaler(BaseLabelScaler):
8
- """ This is a complete label scaler of all the labels supported by NEREL dataset.
9
- """
10
-
11
- def __init__(self):
12
-
13
- self.__uint_to_label_dict = OrderedDict([
14
- (labels.OpinionBelongsTo(), 0),
15
- (labels.OpinionRelatesTo(), 1),
16
- (labels.NegEffectFrom(), 2),
17
- (labels.PosEffectFrom(), 3),
18
- (labels.NegStateFrom(), 4),
19
- (labels.PosStateFrom(), 5),
20
- (labels.NegativeTo(), 6),
21
- (labels.PositiveTo(), 7),
22
- (labels.StateBelongsTo(), 8),
23
- (labels.PosAuthorFrom(), 9),
24
- (labels.NegAuthorFrom(), 10),
25
- (labels.AlternativeName(), 11),
26
- (labels.OriginsFrom(), 12)
27
- ])
28
-
29
- super(SentiNerelLabelScaler, self).__init__(int_dict=self.__uint_to_label_dict,
30
- uint_dict=self.__uint_to_label_dict)
@@ -1,42 +0,0 @@
1
- from arekit.contrib.source.brat.annot import BratAnnotationParser
2
- from arekit.contrib.source.brat.doc import BratDocument
3
- from arekit.contrib.source.brat.sentences_reader import BratDocumentSentencesReader
4
- from arekit.contrib.source.sentinerel.entities import SentiNerelEntityCollection
5
- from arekit.contrib.source.sentinerel.io_utils import SentiNerelIOUtils, DEFAULT_VERSION
6
-
7
-
8
- class SentiNerelDocReader(object):
9
-
10
- @staticmethod
11
- def read_text_relations(filename, version):
12
- assert(isinstance(filename, str))
13
-
14
- return SentiNerelIOUtils.read_from_zip(
15
- inner_path=SentiNerelIOUtils.get_annotation_innerpath(filename),
16
- process_func=lambda input_file: [
17
- relation for relation in BratAnnotationParser.parse_annotations(
18
- input_file=input_file, encoding='utf-8-sig')["relations"]],
19
- version=version)
20
-
21
- @staticmethod
22
- def read_document(filename, doc_id, version=DEFAULT_VERSION, entities_to_ignore=None):
23
- assert(isinstance(filename, str))
24
- assert(isinstance(doc_id, int))
25
-
26
- def file_to_doc(input_file):
27
- sentences = BratDocumentSentencesReader.from_file(input_file=input_file, entities=entities)
28
- return BratDocument(doc_id=doc_id, sentences=sentences, text_relations=text_relations)
29
-
30
- # TODO. #398 issue -- in some cases entities might be nested. Therefore we limit the set
31
- # TODO. of the potential named entities.
32
- eti = ["EFFECT_NEG", "EFFECT_POS", "ARGUMENT_NEG", "ARGUMENT_POS", "EVENT"] \
33
- if entities_to_ignore is None else entities_to_ignore
34
-
35
- entities = SentiNerelEntityCollection.read_collection(
36
- filename=filename, version=version, entities_to_ignore=eti)
37
- text_relations = SentiNerelDocReader.read_text_relations(filename=filename, version=version)
38
-
39
- return SentiNerelIOUtils.read_from_zip(
40
- inner_path=SentiNerelIOUtils.get_doc_innerpath(filename=filename),
41
- process_func=file_to_doc,
42
- version=version)
File without changes
@@ -1,19 +0,0 @@
1
- from arekit.common.utils import progress_bar_defined
2
-
3
-
4
- def iter_synonym_groups(input_file, sep=",", desc=""):
5
- """ All the synonyms groups organized in lines, separated by `sep`
6
- """
7
- lines = input_file.readlines()
8
-
9
- lines_it = progress_bar_defined(lines,
10
- total=len(lines),
11
- desc=desc,
12
- unit="opins")
13
-
14
- for line in lines_it:
15
-
16
- if isinstance(line, bytes):
17
- line = line.decode()
18
-
19
- yield line.split(sep)
@@ -1,47 +0,0 @@
1
- import zipfile
2
-
3
- import enum
4
-
5
- from arekit.common import utils
6
-
7
-
8
- class ZipArchiveUtils(object):
9
-
10
- @staticmethod
11
- def get_archive_filepath(version):
12
- raise NotImplementedError()
13
-
14
- @classmethod
15
- def read_from_zip(cls, inner_path, process_func, version):
16
- """
17
- process_func:
18
- func which receives a file reader
19
- """
20
- assert(isinstance(inner_path, str))
21
- assert(callable(process_func))
22
- assert(isinstance(version, enum.Enum))
23
-
24
- with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
25
- with zip_ref.open(inner_path, mode='r') as c_file:
26
- return process_func(c_file)
27
-
28
- @classmethod
29
- def iter_from_zip(cls, inner_path, process_func, version):
30
- assert(isinstance(inner_path, str))
31
- assert(callable(process_func))
32
- assert(isinstance(version, enum.Enum))
33
-
34
- with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
35
- with zip_ref.open(inner_path, mode='r') as c_file:
36
- for result in process_func(c_file):
37
- yield result
38
-
39
- @classmethod
40
- def iter_filenames_from_zip(cls, version):
41
- assert(isinstance(version, enum.Enum))
42
- with zipfile.ZipFile(cls.get_archive_filepath(version.value), "r") as zip_ref:
43
- return iter(zip_ref.namelist())
44
-
45
- @staticmethod
46
- def get_data_root():
47
- return utils.get_default_download_dir()
File without changes
@@ -1,23 +0,0 @@
1
- from arekit.common.frames.connotations.provider import FrameConnotationProvider
2
- from arekit.contrib.source.rusentiframes.collection import RuSentiFramesCollection
3
-
4
-
5
- class RuSentiFramesConnotationProvider(FrameConnotationProvider):
6
- """ This is a provider based on A0->A1 label type of RuSentiFrames collection.
7
- For a greater details, checkout the related collection at:
8
- https://github.com/nicolay-r/RuSentiFrames
9
-
10
- Papers:
11
- [1] Natalia Loukachevitch, Nicolay Rusnachenko: Sentiment Frames
12
- for Attitude Extraction in Russian, 2020
13
- [2] Distant Supervision for Sentiment Attitude Extraction, 2019
14
- """
15
-
16
- def __init__(self, collection):
17
- assert(isinstance(collection, RuSentiFramesCollection))
18
- self.__collection = collection
19
-
20
- def try_provide(self, frame_id):
21
- return self.__collection.try_get_frame_polarity(frame_id=frame_id,
22
- role_src='a0',
23
- role_dest='a1')
@@ -1,77 +0,0 @@
1
- import os
2
- import tarfile
3
- from os.path import join, exists
4
-
5
- from arekit.common import utils
6
- from arekit.contrib.utils.np_utils.embedding import NpzEmbeddingHelper
7
- from arekit.contrib.utils.np_utils.vocab import VocabRepositoryUtils
8
-
9
- NEWS_MYSTEM_SKIPGRAM_1000_20_2015 = "news_mystem_skipgram_1000_20_2015.tar.gz"
10
-
11
-
12
- def __get_resource(local_name, check_existance=False, download_if_missed=False):
13
- assert (isinstance(local_name, str))
14
- filepath = join(utils.get_default_download_dir(), local_name)
15
-
16
- if check_existance and not exists(filepath):
17
- if download_if_missed:
18
- download()
19
- # We try to get the resource again but won't attempt to download it again.
20
- __get_resource(local_name, check_existance=check_existance, download_if_missed=False)
21
- else:
22
- raise Exception("Resource could not be found: {}".format(filepath))
23
-
24
- return filepath
25
-
26
-
27
- def __get_embedding_dir(filepath):
28
- return filepath.replace(".tar.gz", "")
29
-
30
-
31
- def load_embedding_and_vocab(local_name, check_existance=False, download_if_missed=False):
32
- tar_gz_archive = __get_resource(local_name, check_existance=check_existance,
33
- download_if_missed=download_if_missed)
34
- target_dir = __get_embedding_dir(tar_gz_archive)
35
- embedding = NpzEmbeddingHelper.load_embedding(os.path.join(target_dir, "embedding.npz"))
36
- vocab = VocabRepositoryUtils.load(os.path.join(target_dir, "vocab.txt"))
37
- return embedding, vocab
38
-
39
-
40
- def download():
41
- data = {
42
- NEWS_MYSTEM_SKIPGRAM_1000_20_2015: "https://www.dropbox.com/s/0omnlgzgnjhxlmf/{filename}?dl=1".format(
43
- filename=NEWS_MYSTEM_SKIPGRAM_1000_20_2015),
44
- }
45
-
46
- # Perform downloading ...
47
- for local_name, url_link in data.items():
48
- utils.download(dest_file_path=__get_resource(local_name),
49
- source_url=url_link)
50
-
51
- # Untar files ...
52
- for local_name in data.keys():
53
-
54
- if ".tar.gz" not in local_name:
55
- continue
56
-
57
- target_filepath = __get_resource(local_name)
58
- with tarfile.open(target_filepath) as file:
59
- def is_within_directory(directory, target):
60
-
61
- abs_directory = os.path.abspath(directory)
62
- abs_target = os.path.abspath(target)
63
-
64
- prefix = os.path.commonprefix([abs_directory, abs_target])
65
-
66
- return prefix == abs_directory
67
-
68
- def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
69
-
70
- for member in tar.getmembers():
71
- member_path = os.path.join(path, member.name)
72
- if not is_within_directory(path, member_path):
73
- raise Exception("Attempted Path Traversal in Tar File")
74
-
75
- tar.extractall(path, members, numeric_owner=numeric_owner)
76
-
77
- safe_extract(file, __get_embedding_dir(target_filepath))
@@ -1,37 +0,0 @@
1
- from os.path import join
2
-
3
- from arekit.contrib.utils.data.readers.base import BaseReader
4
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
5
- from arekit.contrib.utils.io_utils.utils import filename_template
6
-
7
-
8
- class OpinionsIO(BaseSamplesIO):
9
-
10
- def __init__(self, target_dir, reader=None, prefix="opinion"):
11
- assert(isinstance(reader, BaseReader))
12
- self.__target_dir = target_dir
13
- self.__prefix = prefix
14
- self.__reader = reader
15
- self.__target_extension = reader.extension()
16
-
17
- @property
18
- def Reader(self):
19
- return self.__reader
20
-
21
- def create_target(self, data_type):
22
- return self.__get_input_opinions_target(data_type)
23
-
24
- def __get_input_opinions_target(self, data_type):
25
- template = filename_template(data_type=data_type)
26
- return self.__get_filepath(out_dir=self.__target_dir,
27
- template=template,
28
- prefix=self.__prefix,
29
- extension=self.__target_extension)
30
-
31
- @staticmethod
32
- def __get_filepath(out_dir, template, prefix, extension):
33
- assert(isinstance(template, str))
34
- assert(isinstance(prefix, str))
35
- assert(isinstance(extension, str))
36
- return join(out_dir, "{prefix}-{template}{extension}".format(
37
- prefix=prefix, template=template, extension=extension))
@@ -1,79 +0,0 @@
1
- import logging
2
- from os.path import join
3
-
4
- from arekit.contrib.utils.data.readers.base import BaseReader
5
- from arekit.common.experiment.api.base_samples_io import BaseSamplesIO
6
- from arekit.contrib.utils.data.writers.base import BaseWriter
7
- from arekit.contrib.utils.io_utils.utils import filename_template, check_targets_existence
8
-
9
- logger = logging.getLogger(__name__)
10
- logging.basicConfig(level=logging.INFO)
11
-
12
-
13
- class SamplesIO(BaseSamplesIO):
14
- """ Samples default IO utils for samples.
15
- Sample is a text part which include pair of attitude participants.
16
- This class allows to provide saver and loader for such entries, bubbed as samples.
17
- Samples required for machine learning training/inferring.
18
- """
19
-
20
- def __init__(self, target_dir, writer=None, reader=None, prefix="sample"):
21
- assert(isinstance(target_dir, str))
22
- assert(isinstance(prefix, str))
23
- assert(isinstance(writer, BaseWriter) or writer is None)
24
- assert(isinstance(reader, BaseReader) or reader is None)
25
- self.__target_dir = target_dir
26
- self.__prefix = prefix
27
- self.__writer = writer
28
- self.__reader = reader
29
-
30
- self.__target_extension = None
31
- if writer is not None:
32
- self.__target_extension = writer.extension()
33
- elif reader is not None:
34
- self.__target_extension = reader.extension()
35
-
36
- # region public methods
37
-
38
- @property
39
- def Prefix(self):
40
- return self.__prefix
41
-
42
- @property
43
- def Reader(self):
44
- return self.__reader
45
-
46
- @property
47
- def Writer(self):
48
- return self.__writer
49
-
50
- def create_target(self, data_type):
51
- return self.__get_input_sample_target(data_type)
52
-
53
- def check_targets_existed(self, data_types_iter):
54
- for data_type in data_types_iter:
55
-
56
- targets = [
57
- self.__get_input_sample_target(data_type=data_type),
58
- ]
59
-
60
- if not check_targets_existence(targets=targets):
61
- return False
62
- return True
63
-
64
- # endregion
65
-
66
- def __get_input_sample_target(self, data_type):
67
- template = filename_template(data_type=data_type)
68
- return self.__get_filepath(out_dir=self.__target_dir,
69
- template=template,
70
- prefix=self.__prefix,
71
- extension=self.__target_extension)
72
-
73
- @staticmethod
74
- def __get_filepath(out_dir, template, prefix, extension):
75
- assert(isinstance(template, str))
76
- assert(isinstance(prefix, str))
77
- assert(isinstance(extension, str))
78
- return join(out_dir, "{prefix}-{template}{extension}".format(
79
- prefix=prefix, template=template, extension=extension))
File without changes
@@ -1,41 +0,0 @@
1
- from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
2
-
3
-
4
- class Lexicon(object):
5
-
6
- @property
7
- def ToneKey(self):
8
- return 'tone'
9
-
10
- @property
11
- def TermKey(self):
12
- return 'term'
13
-
14
- def __init__(self, dataframe):
15
- self.__lexicon_df = dataframe
16
-
17
- @classmethod
18
- def load(cls, filepath, separator=','):
19
- reader = PandasCsvReader(compression=None, sep=separator)
20
- return cls(reader.read(filepath))
21
-
22
- def get_score(self, lemma):
23
- assert(type(lemma) == str)
24
- s = self.__lexicon_df[lemma.encode('utf-8') == self.__lexicon_df[self.TermKey]]
25
- return s[self.ToneKey].values[0] if len(s) > 0 else 0
26
-
27
- def has_term(self, term):
28
- assert(type(term) == str)
29
- s = self.__lexicon_df[term.encode('utf-8') == self.__lexicon_df[self.TermKey]]
30
- return len(s) > 0
31
-
32
- def __iter__(self):
33
- for term in self.__lexicon_df[self.TermKey]:
34
- yield term
35
-
36
- def __contains__(self, item):
37
- assert(isinstance(item, str))
38
- result = self.__lexicon_df[self.__lexicon_df[self.TermKey] == item.encode('utf-8')]
39
- return len(result) > 0
40
-
41
-
@@ -1,42 +0,0 @@
1
- from arekit.contrib.utils.data.readers.csv_pd import PandasCsvReader
2
-
3
-
4
- class RelationLexicon(object):
5
-
6
- def __init__(self, dataframe):
7
- self.__check(dataframe)
8
- self.__lexicon = dataframe
9
-
10
- @classmethod
11
- def load(cls, filepath, separator=','):
12
- reader = PandasCsvReader(compression=None, sep=separator)
13
- return cls(reader.read(filepath))
14
-
15
- @staticmethod
16
- def __check(df):
17
- for index in df.index:
18
- relation = df.loc[index][0]
19
- assert(len(relation.split('<->')) == 2)
20
-
21
- @staticmethod
22
- def __create_key(l, r):
23
- assert(type(l) == str)
24
- assert(type(r) == str)
25
- return '<->'.join([l, r])
26
-
27
- def get_score(self, left, right):
28
- assert(type(left) == str)
29
- assert(type(right) == str)
30
-
31
- lr_key = self.__create_key(left, right)
32
- rl_key = self.__create_key(right, left)
33
-
34
- lr_score = self.__lexicon[lr_key == self.__lexicon['relation']]
35
- rl_score = self.__lexicon[rl_key == self.__lexicon['relation']]
36
-
37
- if len(lr_score) > 0:
38
- return lr_score['tone'].values[0]
39
- if len(rl_score) > 0:
40
- return rl_score['tone'].values[0]
41
-
42
- return None
@@ -1,37 +0,0 @@
1
- import importlib
2
- import zipfile
3
- from os import path
4
-
5
-
6
- from arekit.contrib.source.zip_utils import ZipArchiveUtils
7
- from arekit.contrib.utils.lexicons.lexicon import Lexicon
8
-
9
-
10
- class RuSentiLexLexicon(Lexicon):
11
- """
12
- RuSentiLex Lexicon wrapper for csv file stored in /data folder.
13
- """
14
-
15
- __INNER_PATH = 'rusentilex.csv'
16
-
17
- @property
18
- def ToneKey(self):
19
- return 'tone'
20
-
21
- @property
22
- def TermKey(self):
23
- return 'term'
24
-
25
- @staticmethod
26
- def __get_archive_filepath():
27
- return path.join(ZipArchiveUtils.get_data_root(), "rusentilex.zip")
28
-
29
- @classmethod
30
- def from_zip(cls):
31
- """ Using Pandas API to read lexicon.
32
- """
33
- pd = importlib.import_module("pandas")
34
- with zipfile.ZipFile(cls.__get_archive_filepath(), "r") as zip_ref:
35
- with zip_ref.open(cls.__INNER_PATH, mode='r') as csv_file:
36
- df = pd.read_csv(csv_file, sep=',')
37
- return cls(df)
File without changes