arekit 0.24.0__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. arekit/common/docs/entities_grouping.py +2 -1
  2. arekit/common/docs/parser.py +52 -20
  3. arekit/common/pipeline/base.py +12 -16
  4. arekit/common/pipeline/batching.py +28 -0
  5. arekit/common/pipeline/context.py +5 -1
  6. arekit/common/pipeline/items/base.py +38 -1
  7. arekit/common/pipeline/items/flatten.py +5 -1
  8. arekit/common/pipeline/items/handle.py +2 -1
  9. arekit/common/pipeline/items/iter.py +2 -1
  10. arekit/common/pipeline/items/map.py +2 -1
  11. arekit/common/pipeline/items/map_nested.py +4 -0
  12. arekit/common/pipeline/utils.py +32 -0
  13. arekit/common/service/sqlite.py +36 -0
  14. arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
  15. arekit/common/utils.py +0 -44
  16. arekit/contrib/utils/data/contents/opinions.py +13 -3
  17. arekit/contrib/utils/data/readers/sqlite.py +14 -0
  18. arekit/contrib/utils/data/storages/row_cache.py +6 -1
  19. arekit/contrib/utils/data/storages/sqlite_based.py +17 -0
  20. arekit/contrib/utils/data/writers/sqlite_native.py +4 -0
  21. arekit/contrib/utils/io_utils/utils.py +1 -18
  22. arekit/contrib/utils/pipelines/items/sampling/base.py +7 -12
  23. arekit/contrib/utils/pipelines/items/sampling/networks.py +3 -2
  24. arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
  25. arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
  26. arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +2 -2
  27. arekit/contrib/utils/pipelines/items/text/frames_negation.py +2 -1
  28. arekit/contrib/utils/pipelines/items/text/tokenizer.py +2 -4
  29. arekit/contrib/utils/pipelines/items/text/translator.py +2 -1
  30. arekit/contrib/utils/pipelines/text_opinion/extraction.py +6 -9
  31. arekit/contrib/utils/serializer.py +1 -2
  32. arekit-0.25.0.data/data/logo.png +0 -0
  33. arekit-0.25.0.dist-info/METADATA +82 -0
  34. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/RECORD +38 -153
  35. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/WHEEL +1 -1
  36. arekit/common/docs/objects_parser.py +0 -37
  37. arekit/common/text/parser.py +0 -12
  38. arekit/common/text/partitioning/base.py +0 -4
  39. arekit/common/text/partitioning/terms.py +0 -35
  40. arekit/contrib/source/__init__.py +0 -0
  41. arekit/contrib/source/brat/__init__.py +0 -0
  42. arekit/contrib/source/brat/annot.py +0 -84
  43. arekit/contrib/source/brat/doc.py +0 -28
  44. arekit/contrib/source/brat/entities/__init__.py +0 -0
  45. arekit/contrib/source/brat/entities/compound.py +0 -13
  46. arekit/contrib/source/brat/entities/entity.py +0 -42
  47. arekit/contrib/source/brat/entities/parser.py +0 -53
  48. arekit/contrib/source/brat/opinions/__init__.py +0 -0
  49. arekit/contrib/source/brat/opinions/converter.py +0 -19
  50. arekit/contrib/source/brat/relation.py +0 -32
  51. arekit/contrib/source/brat/sentence.py +0 -69
  52. arekit/contrib/source/brat/sentences_reader.py +0 -128
  53. arekit/contrib/source/download.py +0 -41
  54. arekit/contrib/source/nerel/__init__.py +0 -0
  55. arekit/contrib/source/nerel/entities.py +0 -55
  56. arekit/contrib/source/nerel/folding/__init__.py +0 -0
  57. arekit/contrib/source/nerel/folding/fixed.py +0 -74
  58. arekit/contrib/source/nerel/io_utils.py +0 -62
  59. arekit/contrib/source/nerel/labels.py +0 -241
  60. arekit/contrib/source/nerel/reader.py +0 -46
  61. arekit/contrib/source/nerel/utils.py +0 -24
  62. arekit/contrib/source/nerel/versions.py +0 -12
  63. arekit/contrib/source/nerelbio/__init__.py +0 -0
  64. arekit/contrib/source/nerelbio/io_utils.py +0 -62
  65. arekit/contrib/source/nerelbio/labels.py +0 -265
  66. arekit/contrib/source/nerelbio/reader.py +0 -8
  67. arekit/contrib/source/nerelbio/versions.py +0 -8
  68. arekit/contrib/source/ruattitudes/__init__.py +0 -0
  69. arekit/contrib/source/ruattitudes/collection.py +0 -36
  70. arekit/contrib/source/ruattitudes/doc.py +0 -51
  71. arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
  72. arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
  73. arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
  74. arekit/contrib/source/ruattitudes/io_utils.py +0 -56
  75. arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
  76. arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
  77. arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
  78. arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
  79. arekit/contrib/source/ruattitudes/reader.py +0 -268
  80. arekit/contrib/source/ruattitudes/sentence.py +0 -73
  81. arekit/contrib/source/ruattitudes/synonyms.py +0 -17
  82. arekit/contrib/source/ruattitudes/text_object.py +0 -59
  83. arekit/contrib/source/rusentiframes/__init__.py +0 -0
  84. arekit/contrib/source/rusentiframes/collection.py +0 -157
  85. arekit/contrib/source/rusentiframes/effect.py +0 -24
  86. arekit/contrib/source/rusentiframes/io_utils.py +0 -19
  87. arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
  88. arekit/contrib/source/rusentiframes/polarity.py +0 -35
  89. arekit/contrib/source/rusentiframes/role.py +0 -15
  90. arekit/contrib/source/rusentiframes/state.py +0 -24
  91. arekit/contrib/source/rusentiframes/types.py +0 -42
  92. arekit/contrib/source/rusentiframes/value.py +0 -2
  93. arekit/contrib/source/rusentrel/__init__.py +0 -0
  94. arekit/contrib/source/rusentrel/const.py +0 -3
  95. arekit/contrib/source/rusentrel/docs_reader.py +0 -51
  96. arekit/contrib/source/rusentrel/entities.py +0 -26
  97. arekit/contrib/source/rusentrel/io_utils.py +0 -125
  98. arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
  99. arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
  100. arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
  101. arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
  102. arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
  103. arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
  104. arekit/contrib/source/rusentrel/synonyms.py +0 -17
  105. arekit/contrib/source/sentinerel/__init__.py +0 -0
  106. arekit/contrib/source/sentinerel/entities.py +0 -52
  107. arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
  108. arekit/contrib/source/sentinerel/folding/factory.py +0 -31
  109. arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
  110. arekit/contrib/source/sentinerel/io_utils.py +0 -87
  111. arekit/contrib/source/sentinerel/labels.py +0 -53
  112. arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
  113. arekit/contrib/source/sentinerel/reader.py +0 -42
  114. arekit/contrib/source/synonyms/__init__.py +0 -0
  115. arekit/contrib/source/synonyms/utils.py +0 -19
  116. arekit/contrib/source/zip_utils.py +0 -47
  117. arekit/contrib/utils/connotations/__init__.py +0 -0
  118. arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
  119. arekit/contrib/utils/download.py +0 -77
  120. arekit/contrib/utils/io_utils/opinions.py +0 -37
  121. arekit/contrib/utils/io_utils/samples.py +0 -79
  122. arekit/contrib/utils/lexicons/__init__.py +0 -0
  123. arekit/contrib/utils/lexicons/lexicon.py +0 -41
  124. arekit/contrib/utils/lexicons/relation.py +0 -42
  125. arekit/contrib/utils/lexicons/rusentilex.py +0 -37
  126. arekit/contrib/utils/nn/__init__.py +0 -0
  127. arekit/contrib/utils/nn/rows.py +0 -83
  128. arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
  129. arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
  130. arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
  131. arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
  132. arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
  133. arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
  134. arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
  135. arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
  136. arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
  137. arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
  138. arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
  139. arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
  140. arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
  141. arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
  142. arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
  143. arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
  144. arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
  145. arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
  146. arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
  147. arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
  148. arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
  149. arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
  150. arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
  151. arekit/contrib/utils/resources.py +0 -25
  152. arekit/contrib/utils/sources/__init__.py +0 -0
  153. arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
  154. arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
  155. arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
  156. arekit/download_data.py +0 -11
  157. arekit-0.24.0.dist-info/METADATA +0 -23
  158. /arekit/common/{text/partitioning → service}/__init__.py +0 -0
  159. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/LICENSE +0 -0
  160. {arekit-0.24.0.dist-info → arekit-0.25.0.dist-info}/top_level.txt +0 -0
@@ -1,35 +0,0 @@
1
- from collections.abc import Iterable
2
-
3
- from arekit.common.bound import Bound
4
- from arekit.common.text.partitioning.base import BasePartitioning
5
-
6
-
7
- class TermsPartitioning(BasePartitioning):
8
- """ NOTE: considering that provided parts
9
- has no intersections between each other
10
- """
11
-
12
- def provide(self, text, parts_it):
13
- assert(isinstance(text, list))
14
- assert(isinstance(parts_it, Iterable))
15
-
16
- start = 0
17
- parts = []
18
- for value, bound in parts_it:
19
- assert(isinstance(bound, Bound))
20
- assert(bound.Position >= start)
21
-
22
- # Release everythig till the current value position.
23
- part = text[start:bound.Position]
24
-
25
- parts.extend(part)
26
-
27
- # Release the entity value.
28
- parts.extend([value])
29
-
30
- start = bound.Position + bound.Length
31
-
32
- # Release everything after the last entity.
33
- parts.extend(text[start:len(text)])
34
-
35
- return parts
File without changes
File without changes
@@ -1,84 +0,0 @@
1
- from arekit.contrib.source.brat.entities.entity import BratEntity
2
- from arekit.contrib.source.brat.relation import BratRelation
3
-
4
-
5
- class BratAnnotationParser:
6
-
7
- ENTITIES = "entities"
8
- RELATIONS = "relations"
9
-
10
- @staticmethod
11
- def __non_prefixed_id(value):
12
- assert (isinstance(value, str))
13
- return value[1:]
14
-
15
- @staticmethod
16
- def handle_entity(args):
17
- """ T2 Location 10 23 South America
18
- T1 Location 0 5;16 23 North America
19
- """
20
- assert(len(args) == 3)
21
-
22
- e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
23
- entity_params = args[1].split()
24
-
25
- if len(entity_params) != 3:
26
- # We do not support the case of a non-continuous entity mentions.
27
- return None
28
-
29
- e_str_type, e_begin, e_end = entity_params
30
-
31
- return BratEntity(id_in_doc=e_id,
32
- e_type=e_str_type,
33
- index_begin=int(e_begin),
34
- index_end=int(e_end),
35
- childs=None,
36
- value=args[2].strip())
37
-
38
- @staticmethod
39
- def handle_relation(args):
40
- """ Example:
41
- R1 Origin Arg1:T3 Arg2:T4
42
- """
43
-
44
- # Parse identifier index.
45
- e_id = args[0][1:]
46
-
47
- # Parse relation arguments.
48
- rel_type, source, target = args[1].split()
49
-
50
- source_id = source.split(':')[1]
51
- target_id = target.split(':')[1]
52
-
53
- return BratRelation(id_in_doc=e_id,
54
- source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
55
- target_id=int(BratAnnotationParser.__non_prefixed_id(target_id)),
56
- rel_type=rel_type)
57
-
58
- @staticmethod
59
- def parse_annotations(input_file, encoding='utf-8'):
60
- """ Read annotation collection from file
61
- """
62
- entities = []
63
- relations = []
64
-
65
- for line in input_file.readlines():
66
- line = line.decode(encoding)
67
-
68
- args = line.split('\t')
69
-
70
- record_type = args[0][0]
71
-
72
- # Entities (objects) are prefixed with `T`
73
- if record_type == "T":
74
- entity = BratAnnotationParser.handle_entity(args)
75
- if entity is not None:
76
- entities.append(entity)
77
-
78
- elif record_type == "R":
79
- relations.append(BratAnnotationParser.handle_relation(args))
80
-
81
- return {
82
- BratAnnotationParser.ENTITIES: entities,
83
- BratAnnotationParser.RELATIONS: relations
84
- }
@@ -1,28 +0,0 @@
1
- from arekit.common.docs.base import Document
2
- from arekit.contrib.source.brat.entities.entity import BratEntity
3
- from arekit.contrib.source.brat.sentence import BratSentence
4
-
5
-
6
- class BratDocument(Document):
7
-
8
- def __init__(self, doc_id, sentences, text_relations):
9
- assert(isinstance(text_relations, list) or text_relations is None)
10
- super(BratDocument, self).__init__(doc_id=doc_id, sentences=sentences)
11
- self.__text_relations = text_relations
12
- self.__entity_by_id = {}
13
- for sentence in sentences:
14
- assert(isinstance(sentence, BratSentence))
15
- for brat_entity, _ in sentence.iter_entity_with_local_bounds():
16
- assert(isinstance(brat_entity, BratEntity))
17
- self.__entity_by_id[brat_entity.ID] = brat_entity
18
-
19
- @property
20
- def Relations(self):
21
- for brat_relation in self.__text_relations:
22
- yield brat_relation
23
-
24
- def contains_entity(self, entity_id):
25
- return entity_id in self.__entity_by_id
26
-
27
- def get_entity_by_id(self, entity_id):
28
- return self.__entity_by_id[entity_id]
File without changes
@@ -1,13 +0,0 @@
1
- from arekit.contrib.source.brat.entities.entity import BratEntity
2
-
3
-
4
- class BratCompoundEntity(BratEntity):
5
- """ Entity which contains the hierarchy of the other entities.
6
- """
7
-
8
- @classmethod
9
- def from_list(cls, root, childs):
10
- assert(isinstance(root, BratEntity))
11
- assert(isinstance(childs, list) and len(childs) > 0)
12
- return cls(id_in_doc=root.ID, value=root.Value, e_type=root.Type, childs=childs,
13
- index_begin=root.IndexBegin, index_end=root.IndexEnd)
@@ -1,42 +0,0 @@
1
- from arekit.common.entities.base import Entity
2
-
3
-
4
- class BratEntity(Entity):
5
- """ Annotated entity in Brat-based collection corpus.
6
- Provides bounds, i.e. char indices in related sentence.
7
- """
8
-
9
- def __init__(self, id_in_doc, e_type, index_begin, index_end, value, childs, display_value=None, group_index=None):
10
- """ index_begin: int
11
- - char index (in case of string type of `text`)
12
- - term index (in case of list type of `text`)
13
- index_end: int
14
- - char index (in case of string type of `text`)
15
- - term index (in case of list type of `text`)
16
- """
17
- assert(isinstance(e_type, str))
18
- assert(isinstance(index_begin, int))
19
- assert(isinstance(index_end, int))
20
- super(BratEntity, self).__init__(value=value, e_type=e_type, childs=childs,
21
- display_value=display_value, group_index=group_index)
22
-
23
- self.__e_type = e_type
24
- self.__begin = index_begin
25
- self.__end = index_end
26
- self.__id = id_in_doc
27
-
28
- @property
29
- def IndexBegin(self):
30
- return self.__begin
31
-
32
- @property
33
- def IndexEnd(self):
34
- return self.__end
35
-
36
- @property
37
- def Type(self):
38
- return self.__e_type
39
-
40
- @property
41
- def ID(self):
42
- return self.__id
@@ -1,53 +0,0 @@
1
- from arekit.common.docs.objects_parser import SentenceObjectsParserPipelineItem
2
- from arekit.common.pipeline.context import PipelineContext
3
- from arekit.common.text.partitioning.str import StringPartitioning
4
- from arekit.common.text.partitioning.terms import TermsPartitioning
5
- from arekit.contrib.source.brat.sentence import BratSentence
6
-
7
-
8
- class BratTextEntitiesParser(SentenceObjectsParserPipelineItem):
9
-
10
- KEY = "sentence"
11
-
12
- ################################
13
- # NOTE: Supported partitionings.
14
- ################################
15
- # By default, BRAT annotation proposes to adopt entities annotation
16
- # based on string input, which means that entity ends described as
17
- # `char-ind-begin` and `char-ind-end`. However, the latter could be
18
- # expanded to list of terms, which means that we deal with `ind-begin`
19
- # and `ind-end` list indices.
20
- __supported_partitionings = {
21
- "string": StringPartitioning(),
22
- "terms": TermsPartitioning()
23
- }
24
-
25
- def __init__(self, partitioning="string"):
26
- assert(isinstance(partitioning, str))
27
- super(BratTextEntitiesParser, self).__init__(self.__supported_partitionings[partitioning])
28
-
29
- # region protected methods
30
-
31
- def _get_text(self, pipeline_ctx):
32
- sentence = self.__get_sentence(pipeline_ctx)
33
- return sentence.Text
34
-
35
- def _get_parts_provider_func(self, input_data, pipeline_ctx):
36
- sentence = self.__get_sentence(pipeline_ctx)
37
- return self.__iter_subs_values_with_bounds(sentence)
38
-
39
- # endregion
40
-
41
- # region private methods
42
-
43
- def __get_sentence(self, pipeline_ctx):
44
- assert(isinstance(pipeline_ctx, PipelineContext))
45
- assert(self.KEY in pipeline_ctx)
46
- return pipeline_ctx.provide(self.KEY)
47
-
48
- @staticmethod
49
- def __iter_subs_values_with_bounds(sentence):
50
- assert(isinstance(sentence, BratSentence))
51
- return sentence.iter_entity_with_local_bounds()
52
-
53
- # endregion
File without changes
@@ -1,19 +0,0 @@
1
- from arekit.common.labels.str_fmt import StringLabelsFormatter
2
- from arekit.common.text_opinions.base import TextOpinion
3
- from arekit.contrib.source.brat.relation import BratRelation
4
-
5
-
6
- class BratRelationConverter(object):
7
-
8
- @staticmethod
9
- def to_text_opinion(brat_relation, doc_id, label_formatter):
10
- """ Converts opinion into document-level referenced opinion
11
- """
12
- assert (isinstance(brat_relation, BratRelation))
13
- assert(isinstance(label_formatter, StringLabelsFormatter))
14
-
15
- return TextOpinion(doc_id=doc_id,
16
- text_opinion_id=int(brat_relation.ID),
17
- source_id=brat_relation.SourceID,
18
- target_id=brat_relation.TargetID,
19
- label=label_formatter.str_to_label(brat_relation.Type))
@@ -1,32 +0,0 @@
1
- class BratRelation(object):
2
-
3
- def __init__(self, id_in_doc, source_id, target_id, rel_type):
4
- assert(isinstance(id_in_doc, str))
5
- assert(isinstance(source_id, int))
6
- assert(isinstance(target_id, int))
7
- assert(isinstance(rel_type, str))
8
-
9
- self.__id = id_in_doc
10
- self.__rel_type = rel_type
11
- self.__source_id = source_id
12
- self.__target_id = target_id
13
-
14
- @property
15
- def ID(self):
16
- return self.__id
17
-
18
- @property
19
- def Type(self):
20
- return self.__rel_type
21
-
22
- @property
23
- def SourceID(self):
24
- """ Arg0.
25
- """
26
- return self.__source_id
27
-
28
- @property
29
- def TargetID(self):
30
- """ Arg1.
31
- """
32
- return self.__target_id
@@ -1,69 +0,0 @@
1
- from functools import cmp_to_key
2
-
3
- from arekit.common.bound import Bound
4
- from arekit.common.docs.sentence import BaseDocumentSentence
5
- from arekit.contrib.source.brat.entities.compound import BratCompoundEntity
6
- from arekit.contrib.source.brat.entities.entity import BratEntity
7
-
8
-
9
- class BratSentence(BaseDocumentSentence):
10
- """ Represent a raw sentence of BRAT.
11
- Provides text could be used to parse then.
12
- Provides API to store entities.
13
- """
14
-
15
- def __init__(self, text, index_begin, entities):
16
- """ entities: list of BratEntities
17
- index_begin: int
18
- - char index (in case of string type of `text`)
19
- - term index (in case of list type of `text`)
20
- """
21
- assert(isinstance(text, str) or isinstance(text, list))
22
- assert(isinstance(index_begin, int))
23
- assert(isinstance(entities, list))
24
- super(BratSentence, self).__init__(text=text)
25
- self.__index_begin = index_begin
26
- self.__entities = entities
27
-
28
- @staticmethod
29
- def cmp_entities(a, b):
30
- assert(isinstance(a, BratEntity))
31
- assert(isinstance(b, BratEntity))
32
- if a.IndexBegin != b.IndexBegin:
33
- # Ordered by appearance
34
- return a.IndexBegin - b.IndexBegin
35
- else:
36
- # Ordered by length first
37
- b_length = b.IndexEnd - b.IndexBegin
38
- a_length = a.IndexEnd - a.IndexBegin
39
- return b_length - a_length
40
-
41
- def iter_entity_with_local_bounds(self):
42
- self.__entities.sort(key=cmp_to_key(lambda a, b: self.cmp_entities(a, b)))
43
-
44
- bounds_and_entities = []
45
-
46
- # Merging nested entities.
47
- for entity in self.__entities:
48
- start = entity.IndexBegin - self.__index_begin
49
- end = entity.IndexEnd - self.__index_begin
50
- bound = Bound(pos=start, length=end - start)
51
-
52
- updated = False
53
- if len(bounds_and_entities) > 0:
54
- last_bound, last_entities = bounds_and_entities[-1]
55
- if bound.itersects_with(last_bound):
56
- # Update.
57
- last_entities.append(entity)
58
- bounds_and_entities[-1] = (bound.intersect(last_bound), last_entities)
59
- updated = True
60
-
61
- if not updated:
62
- bounds_and_entities.append((bound, [entity]))
63
-
64
- # Returning result.
65
- for item in bounds_and_entities:
66
- bound, entities = item
67
- entity = entities[0] if len(entities) == 1 else \
68
- BratCompoundEntity.from_list(root=entities[0], childs=entities[1:])
69
- yield entity, bound
@@ -1,128 +0,0 @@
1
- from arekit.common.entities.collection import EntityCollection
2
- from arekit.contrib.source.brat.entities.entity import BratEntity
3
- from arekit.contrib.source.brat.sentence import BratSentence
4
-
5
-
6
- class BratDocumentSentencesReader(object):
7
-
8
- @staticmethod
9
- def from_file(input_file, entities, line_handler=None, skip_entity_func=None):
10
- assert(isinstance(entities, EntityCollection))
11
- assert(callable(skip_entity_func) or skip_entity_func is None)
12
-
13
- sentences_data = BratDocumentSentencesReader._parse_sentences(
14
- input_file=input_file, line_handler=line_handler)
15
-
16
- sentence_entities = BratDocumentSentencesReader._parse_entities(
17
- sentences_data=sentences_data,
18
- entities=entities,
19
- skip_entity_func=skip_entity_func)
20
-
21
- # Convert all the content to brat sentences.
22
- brat_sentences = []
23
- for s_ind, s_dict in enumerate(sentences_data):
24
- brat_sentence = BratSentence(text=s_dict["text"],
25
- index_begin=s_dict["ind_begin"],
26
- entities=sentence_entities[s_ind])
27
- brat_sentences.append(brat_sentence)
28
-
29
- return brat_sentences
30
-
31
- @staticmethod
32
- def from_sentences_data(entities, sentences_data, skip_entity_func=None):
33
- assert(isinstance(entities, EntityCollection))
34
-
35
- sentence_entities = BratDocumentSentencesReader._parse_entities(
36
- sentences_data=sentences_data,
37
- entities=entities,
38
- skip_entity_func=skip_entity_func)
39
-
40
- # Convert all the content to brat sentences.
41
- brat_sentences = []
42
- for s_ind, s_dict in enumerate(sentences_data):
43
- brat_sentence = BratSentence(text=s_dict["text"],
44
- index_begin=s_dict["ind_begin"],
45
- entities=sentence_entities[s_ind])
46
- brat_sentences.append(brat_sentence)
47
-
48
- return brat_sentences
49
-
50
- @staticmethod
51
- def __is_sentence_contains(sentence_data, entity):
52
- assert(isinstance(sentence_data, dict))
53
- assert(isinstance(entity, BratEntity))
54
- return entity.IndexBegin >= sentence_data["ind_begin"] and \
55
- entity.IndexEnd <= sentence_data["ind_end"]
56
-
57
- @staticmethod
58
- def _parse_entities(sentences_data, entities, skip_entity_func):
59
- """ Sentences is a list of json-like data (dictionaries).
60
- """
61
- assert(isinstance(sentences_data, list))
62
- assert(isinstance(entities, EntityCollection))
63
-
64
- entities_in_sentences = [[] for _ in range(len(sentences_data))]
65
-
66
- s_ind = 0
67
- e_ind = 0
68
-
69
- while s_ind < len(sentences_data) and e_ind < len(entities):
70
- e = entities.get_entity_by_index(e_ind)
71
- assert (isinstance(e, BratEntity))
72
-
73
- s = sentences_data[s_ind]
74
- entities_in_sentence = entities_in_sentences[s_ind]
75
-
76
- # If entity goes after the current sentence.
77
- if e.IndexBegin > s["ind_end"]:
78
- s_ind += 1
79
- continue
80
-
81
- if skip_entity_func is not None and skip_entity_func(e):
82
- e_ind += 1
83
- continue
84
-
85
- if BratDocumentSentencesReader.__is_sentence_contains(sentence_data=s, entity=e):
86
- entities_in_sentence.append(e)
87
- e_ind += 1
88
- continue
89
-
90
- if e.IndexEnd > s["ind_end"]:
91
- # Intersects with the right border of sentence
92
- s_ind += 1
93
- continue
94
-
95
- if e.IndexBegin < s["ind_begin"]:
96
- # Intersects with the left border of sentence
97
- e_ind += 1
98
- continue
99
-
100
- raise Exception("e_i:{} e:('{}',{},{}), s_i:{}, s_b: [{} {}]".format(
101
- e_ind,
102
- e.Value, e.IndexBegin, e.IndexEnd,
103
- s_ind,
104
- s["ind_begin"], s["ind_end"]))
105
-
106
- return entities_in_sentences
107
-
108
- @staticmethod
109
- def _parse_sentences(input_file, line_handler):
110
- assert(callable(line_handler) or line_handler is None)
111
- sentences = []
112
- line_start = 0
113
-
114
- for line in input_file.readlines():
115
-
116
- line = line.decode('utf-8')
117
- handled_line = line_handler(line) if line_handler is not None else line
118
-
119
- assert(len(line) == len(handled_line))
120
-
121
- line_end = line_start + len(handled_line) - 1
122
-
123
- if handled_line != str('\r\n'):
124
- sentences.append({"text": handled_line, "ind_begin": line_start, "ind_end": line_end})
125
-
126
- line_start = line_end + 1
127
-
128
- return sentences
@@ -1,41 +0,0 @@
1
- from os.path import join
2
-
3
- from arekit.common import utils
4
-
5
-
6
- def download():
7
- root_dir = utils.get_default_download_dir()
8
-
9
- data = {
10
- # RuSentiLex
11
- "rusentilex.zip": "https://www.dropbox.com/s/bdsl3kney30y45z/rusentilex.zip?dl=1",
12
- # RuSentRel-v1.1
13
- "rusentrel-v1_1.zip": "https://www.dropbox.com/s/6aw5jv84jf5hrl2/rusentrel-v1_1.zip?dl=1",
14
- # RuSentiFrames
15
- "rusentiframes-v1_0.zip": "https://www.dropbox.com/s/zvkis77li3f40bm/rusentiframes-v1_0.zip?dl=1",
16
- "rusentiframes-v2_0.zip": "https://www.dropbox.com/s/slbyma7eudmmugp/rusentiframes-v2_0.zip?dl=1",
17
- # RuAttitudes-v1.0 (Many variations)
18
- "ruattitudes-dbg.zip": "https://www.dropbox.com/s/5lmqw9kyb4tfm94/ruattitudes-dbg.zip?dl=1",
19
- "ruattitudes-v1_0.zip": "https://www.dropbox.com/s/wg6oa447msdytj3/ruattitudes-v1_0.zip?dl=1",
20
- "ruattitudes-v1_1.zip": "https://www.dropbox.com/s/e3menx5iqyush19/ruattitudes-v1_1.zip?dl=1",
21
- # RuAttitudes-v2.0 Base
22
- "ruattitudes-v2_0_base.zip": "https://www.dropbox.com/s/y39vqzzjumqhce1/ruattitudes_20_base.zip?dl=1",
23
- "ruattitudes-v2_0_base_neut.zip": "https://www.dropbox.com/s/3xh7gd004oyuwx5/ruattitudes_20_base_neut.zip?dl=1",
24
- # RuAttitudes-v2.0 Large
25
- "ruattitudes-v2_0_large.zip": "https://www.dropbox.com/s/43iqoxlyh38qk8u/ruattitudes_20_large.zip?dl=1",
26
- "ruattitudes-v2_0_large_neut.zip": "https://www.dropbox.com/s/6edqsxehtus4c61/ruattitudes_20_large_neut.zip?dl=1",
27
- # SentiNEREL
28
- "sentinerel-v1_0.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v1_0.zip?dl=1",
29
- "sentinerel-v2_0.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_0.zip?dl=1",
30
- "sentinerel-v2_1.zip": "https://www.dropbox.com/s/<HIDDEN>/sentinerel-v2_1.zip?dl=1",
31
- # NEREL
32
- "nerel-v1_0.zip": "https://www.dropbox.com/scl/fi/vegk0aczjdm9km410loqv/nerel-v1_0.zip?rlkey=wv0ut86n3x5ao6xabsaxd7lh7&dl=1",
33
- "nerel-v1_1.zip": "https://www.dropbox.com/scl/fi/oaytj0rvx7vhdxjk98x7g/nerel-v1_1.zip?rlkey=klrq0l5rpn10cf7e2swkay6r4&dl=1",
34
- # NEREL-BIO
35
- "nerel-bio-v1_0.zip": "https://www.dropbox.com/scl/fi/nltuulfixbkhg3raczash/nerel-bio-v1_0.zip?rlkey=86uizq1hbkgkx302c5p5znpp6&dl=1"
36
- }
37
-
38
- # Perform downloading ...
39
- for local_name, url_link in data.items():
40
- utils.download(dest_file_path=join(root_dir, local_name),
41
- source_url=url_link)
File without changes
@@ -1,55 +0,0 @@
1
- from arekit.common.entities.collection import EntityCollection
2
- from arekit.common.synonyms.grouping import SynonymsCollectionValuesGroupingProviders
3
- from arekit.contrib.source.brat.annot import BratAnnotationParser
4
- from arekit.contrib.source.brat.entities.entity import BratEntity
5
- from arekit.contrib.source.nerel.io_utils import NerelIOUtils
6
- from arekit.contrib.utils.processing.lemmatization.mystem import MystemWrapper
7
- from arekit.contrib.utils.synonyms.stemmer_based import StemmerBasedSynonymCollection
8
-
9
-
10
- class NerelEntityCollection(EntityCollection):
11
-
12
- def __init__(self, contents, value_to_group_id_func, entities_to_ignore=None):
13
- """
14
- entities_to_ignore: list or None
15
- this parameter is required because of the simplified implementation of
16
- the nested objects of the BRAT annotation.
17
- """
18
- assert(isinstance(contents, dict))
19
- assert(BratAnnotationParser.ENTITIES in contents)
20
- assert(isinstance(entities_to_ignore, list) or entities_to_ignore is None)
21
-
22
- self.__discard_entities = set([] if entities_to_ignore is None else entities_to_ignore)
23
- contents[BratAnnotationParser.ENTITIES] = [e for e in contents[BratAnnotationParser.ENTITIES]
24
- if self.__keep_entity(e)]
25
-
26
- super(NerelEntityCollection, self).__init__(
27
- entities=contents[BratAnnotationParser.ENTITIES],
28
- value_to_group_id_func=value_to_group_id_func)
29
-
30
- self._sort_entities(key=lambda entity: entity.IndexBegin)
31
-
32
- def __keep_entity(self, entity):
33
- assert(isinstance(entity, BratEntity))
34
- return entity.Type not in self.__discard_entities
35
-
36
- @classmethod
37
- def read_collection(cls, filename, version, io_utils, entities_to_ignore=None):
38
- assert(isinstance(io_utils, NerelIOUtils))
39
- assert(isinstance(filename, str))
40
-
41
- # Since this dataset does not provide the synonyms collection by default,
42
- # it is necessary to declare an empty collection to populate so in further.
43
- synonyms = StemmerBasedSynonymCollection(stemmer=MystemWrapper(), is_read_only=False)
44
-
45
- doc_fold = io_utils.map_doc_to_fold_type(version)
46
-
47
- return io_utils.read_from_zip(
48
- inner_path=io_utils.get_annotation_innerpath(folding_data_type=doc_fold[filename], filename=filename),
49
- process_func=lambda input_file: cls(
50
- contents=BratAnnotationParser.parse_annotations(input_file=input_file, encoding='utf-8-sig'),
51
- entities_to_ignore=entities_to_ignore,
52
- value_to_group_id_func=lambda value:
53
- SynonymsCollectionValuesGroupingProviders.provide_existed_or_register_missed_value(
54
- synonyms, value)),
55
- version=version)
File without changes