arekit 0.24.0__py3-none-any.whl → 0.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arekit/common/data/storages/base.py +4 -15
- arekit/common/docs/entities_grouping.py +2 -1
- arekit/common/docs/parser.py +27 -22
- arekit/common/pipeline/base.py +12 -16
- arekit/common/pipeline/batching.py +28 -0
- arekit/common/pipeline/context.py +5 -1
- arekit/common/pipeline/items/base.py +39 -2
- arekit/common/pipeline/items/flatten.py +5 -1
- arekit/common/pipeline/items/handle.py +2 -1
- arekit/common/pipeline/items/iter.py +2 -1
- arekit/common/pipeline/items/map.py +2 -1
- arekit/common/pipeline/items/map_nested.py +4 -0
- arekit/common/pipeline/utils.py +32 -0
- arekit/common/text/{partitioning/str.py → partitioning.py} +14 -9
- arekit/common/utils.py +11 -52
- arekit/contrib/utils/data/contents/opinions.py +13 -3
- arekit/contrib/utils/data/storages/jsonl_based.py +2 -1
- arekit/contrib/utils/data/storages/pandas_based.py +2 -17
- arekit/contrib/utils/data/storages/row_cache.py +8 -2
- arekit/contrib/utils/data/storages/sqlite_based.py +18 -0
- arekit/contrib/utils/io_utils/utils.py +1 -18
- arekit/contrib/utils/pipelines/items/text/entities_default.py +2 -2
- arekit/contrib/utils/pipelines/items/text/frames.py +2 -3
- arekit/contrib/utils/pipelines/text_opinion/extraction.py +8 -10
- arekit-0.25.1.data/data/logo.png +0 -0
- arekit-0.25.1.dist-info/METADATA +81 -0
- arekit-0.25.1.dist-info/RECORD +186 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/WHEEL +1 -1
- arekit/common/data/input/repositories/__init__.py +0 -0
- arekit/common/data/input/repositories/base.py +0 -68
- arekit/common/data/input/repositories/sample.py +0 -22
- arekit/common/data/views/__init__.py +0 -0
- arekit/common/data/views/samples.py +0 -26
- arekit/common/docs/objects_parser.py +0 -37
- arekit/common/text/parser.py +0 -12
- arekit/common/text/partitioning/__init__.py +0 -0
- arekit/common/text/partitioning/base.py +0 -4
- arekit/common/text/partitioning/terms.py +0 -35
- arekit/contrib/networks/__init__.py +0 -0
- arekit/contrib/networks/embedding.py +0 -149
- arekit/contrib/networks/embedding_io.py +0 -18
- arekit/contrib/networks/input/__init__.py +0 -0
- arekit/contrib/networks/input/const.py +0 -6
- arekit/contrib/networks/input/ctx_serialization.py +0 -28
- arekit/contrib/networks/input/embedding/__init__.py +0 -0
- arekit/contrib/networks/input/embedding/matrix.py +0 -29
- arekit/contrib/networks/input/embedding/offsets.py +0 -55
- arekit/contrib/networks/input/formatters/__init__.py +0 -0
- arekit/contrib/networks/input/formatters/pos_mapper.py +0 -22
- arekit/contrib/networks/input/providers/__init__.py +0 -0
- arekit/contrib/networks/input/providers/sample.py +0 -129
- arekit/contrib/networks/input/providers/term_connotation.py +0 -23
- arekit/contrib/networks/input/providers/text.py +0 -24
- arekit/contrib/networks/input/rows_parser.py +0 -47
- arekit/contrib/networks/input/term_types.py +0 -13
- arekit/contrib/networks/input/terms_mapping.py +0 -60
- arekit/contrib/networks/vectorizer.py +0 -6
- arekit/contrib/source/__init__.py +0 -0
- arekit/contrib/source/brat/__init__.py +0 -0
- arekit/contrib/source/brat/annot.py +0 -84
- arekit/contrib/source/brat/doc.py +0 -28
- arekit/contrib/source/brat/entities/__init__.py +0 -0
- arekit/contrib/source/brat/entities/compound.py +0 -13
- arekit/contrib/source/brat/entities/entity.py +0 -42
- arekit/contrib/source/brat/entities/parser.py +0 -53
- arekit/contrib/source/brat/opinions/__init__.py +0 -0
- arekit/contrib/source/brat/opinions/converter.py +0 -19
- arekit/contrib/source/brat/relation.py +0 -32
- arekit/contrib/source/brat/sentence.py +0 -69
- arekit/contrib/source/brat/sentences_reader.py +0 -128
- arekit/contrib/source/download.py +0 -41
- arekit/contrib/source/nerel/__init__.py +0 -0
- arekit/contrib/source/nerel/entities.py +0 -55
- arekit/contrib/source/nerel/folding/__init__.py +0 -0
- arekit/contrib/source/nerel/folding/fixed.py +0 -74
- arekit/contrib/source/nerel/io_utils.py +0 -62
- arekit/contrib/source/nerel/labels.py +0 -241
- arekit/contrib/source/nerel/reader.py +0 -46
- arekit/contrib/source/nerel/utils.py +0 -24
- arekit/contrib/source/nerel/versions.py +0 -12
- arekit/contrib/source/nerelbio/__init__.py +0 -0
- arekit/contrib/source/nerelbio/io_utils.py +0 -62
- arekit/contrib/source/nerelbio/labels.py +0 -265
- arekit/contrib/source/nerelbio/reader.py +0 -8
- arekit/contrib/source/nerelbio/versions.py +0 -8
- arekit/contrib/source/ruattitudes/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/collection.py +0 -36
- arekit/contrib/source/ruattitudes/doc.py +0 -51
- arekit/contrib/source/ruattitudes/doc_brat.py +0 -44
- arekit/contrib/source/ruattitudes/entity/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/entity/parser.py +0 -7
- arekit/contrib/source/ruattitudes/io_utils.py +0 -56
- arekit/contrib/source/ruattitudes/labels_fmt.py +0 -12
- arekit/contrib/source/ruattitudes/opinions/__init__.py +0 -0
- arekit/contrib/source/ruattitudes/opinions/base.py +0 -28
- arekit/contrib/source/ruattitudes/opinions/converter.py +0 -37
- arekit/contrib/source/ruattitudes/reader.py +0 -268
- arekit/contrib/source/ruattitudes/sentence.py +0 -73
- arekit/contrib/source/ruattitudes/synonyms.py +0 -17
- arekit/contrib/source/ruattitudes/text_object.py +0 -59
- arekit/contrib/source/rusentiframes/__init__.py +0 -0
- arekit/contrib/source/rusentiframes/collection.py +0 -157
- arekit/contrib/source/rusentiframes/effect.py +0 -24
- arekit/contrib/source/rusentiframes/io_utils.py +0 -19
- arekit/contrib/source/rusentiframes/labels_fmt.py +0 -22
- arekit/contrib/source/rusentiframes/polarity.py +0 -35
- arekit/contrib/source/rusentiframes/role.py +0 -15
- arekit/contrib/source/rusentiframes/state.py +0 -24
- arekit/contrib/source/rusentiframes/types.py +0 -42
- arekit/contrib/source/rusentiframes/value.py +0 -2
- arekit/contrib/source/rusentrel/__init__.py +0 -0
- arekit/contrib/source/rusentrel/const.py +0 -3
- arekit/contrib/source/rusentrel/docs_reader.py +0 -51
- arekit/contrib/source/rusentrel/entities.py +0 -26
- arekit/contrib/source/rusentrel/io_utils.py +0 -125
- arekit/contrib/source/rusentrel/labels_fmt.py +0 -12
- arekit/contrib/source/rusentrel/opinions/__init__.py +0 -0
- arekit/contrib/source/rusentrel/opinions/collection.py +0 -30
- arekit/contrib/source/rusentrel/opinions/converter.py +0 -40
- arekit/contrib/source/rusentrel/opinions/provider.py +0 -54
- arekit/contrib/source/rusentrel/opinions/writer.py +0 -42
- arekit/contrib/source/rusentrel/synonyms.py +0 -17
- arekit/contrib/source/sentinerel/__init__.py +0 -0
- arekit/contrib/source/sentinerel/entities.py +0 -52
- arekit/contrib/source/sentinerel/folding/__init__.py +0 -0
- arekit/contrib/source/sentinerel/folding/factory.py +0 -31
- arekit/contrib/source/sentinerel/folding/fixed.py +0 -70
- arekit/contrib/source/sentinerel/io_utils.py +0 -87
- arekit/contrib/source/sentinerel/labels.py +0 -53
- arekit/contrib/source/sentinerel/labels_scaler.py +0 -30
- arekit/contrib/source/sentinerel/reader.py +0 -42
- arekit/contrib/source/synonyms/__init__.py +0 -0
- arekit/contrib/source/synonyms/utils.py +0 -19
- arekit/contrib/source/zip_utils.py +0 -47
- arekit/contrib/utils/connotations/__init__.py +0 -0
- arekit/contrib/utils/connotations/rusentiframes_sentiment.py +0 -23
- arekit/contrib/utils/data/readers/__init__.py +0 -0
- arekit/contrib/utils/data/readers/base.py +0 -7
- arekit/contrib/utils/data/readers/csv_pd.py +0 -38
- arekit/contrib/utils/data/readers/jsonl.py +0 -15
- arekit/contrib/utils/data/service/__init__.py +0 -0
- arekit/contrib/utils/data/service/balance.py +0 -50
- arekit/contrib/utils/data/writers/csv_native.py +0 -63
- arekit/contrib/utils/data/writers/csv_pd.py +0 -40
- arekit/contrib/utils/data/writers/json_opennre.py +0 -132
- arekit/contrib/utils/data/writers/sqlite_native.py +0 -110
- arekit/contrib/utils/download.py +0 -77
- arekit/contrib/utils/embeddings/__init__.py +0 -0
- arekit/contrib/utils/embeddings/rusvectores.py +0 -58
- arekit/contrib/utils/embeddings/tokens.py +0 -30
- arekit/contrib/utils/io_utils/embedding.py +0 -72
- arekit/contrib/utils/io_utils/opinions.py +0 -37
- arekit/contrib/utils/io_utils/samples.py +0 -79
- arekit/contrib/utils/lexicons/__init__.py +0 -0
- arekit/contrib/utils/lexicons/lexicon.py +0 -41
- arekit/contrib/utils/lexicons/relation.py +0 -42
- arekit/contrib/utils/lexicons/rusentilex.py +0 -37
- arekit/contrib/utils/nn/__init__.py +0 -0
- arekit/contrib/utils/nn/rows.py +0 -83
- arekit/contrib/utils/np_utils/__init__.py +0 -0
- arekit/contrib/utils/np_utils/embedding.py +0 -22
- arekit/contrib/utils/np_utils/npz_utils.py +0 -13
- arekit/contrib/utils/np_utils/vocab.py +0 -20
- arekit/contrib/utils/pipelines/items/sampling/__init__.py +0 -0
- arekit/contrib/utils/pipelines/items/sampling/base.py +0 -99
- arekit/contrib/utils/pipelines/items/sampling/networks.py +0 -54
- arekit/contrib/utils/pipelines/items/text/frames_lemmatized.py +0 -36
- arekit/contrib/utils/pipelines/items/text/frames_negation.py +0 -32
- arekit/contrib/utils/pipelines/items/text/terms_splitter.py +0 -10
- arekit/contrib/utils/pipelines/items/text/tokenizer.py +0 -107
- arekit/contrib/utils/pipelines/items/text/translator.py +0 -135
- arekit/contrib/utils/pipelines/sources/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel/doc_provider.py +0 -27
- arekit/contrib/utils/pipelines/sources/nerel/extract_text_relations.py +0 -65
- arekit/contrib/utils/pipelines/sources/nerel/labels_fmt.py +0 -60
- arekit/contrib/utils/pipelines/sources/nerel_bio/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/nerel_bio/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/nerel_bio/extrat_text_relations.py +0 -64
- arekit/contrib/utils/pipelines/sources/nerel_bio/labels_fmt.py +0 -79
- arekit/contrib/utils/pipelines/sources/ruattitudes/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/ruattitudes/doc_provider.py +0 -56
- arekit/contrib/utils/pipelines/sources/ruattitudes/entity_filter.py +0 -20
- arekit/contrib/utils/pipelines/sources/ruattitudes/extract_text_opinions.py +0 -65
- arekit/contrib/utils/pipelines/sources/rusentrel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/rusentrel/doc_provider.py +0 -21
- arekit/contrib/utils/pipelines/sources/rusentrel/extract_text_opinions.py +0 -107
- arekit/contrib/utils/pipelines/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/pipelines/sources/sentinerel/doc_provider.py +0 -29
- arekit/contrib/utils/pipelines/sources/sentinerel/entity_filter.py +0 -62
- arekit/contrib/utils/pipelines/sources/sentinerel/extract_text_opinions.py +0 -180
- arekit/contrib/utils/pipelines/sources/sentinerel/labels_fmt.py +0 -50
- arekit/contrib/utils/pipelines/text_opinion/annot/predefined.py +0 -88
- arekit/contrib/utils/processing/languages/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/mods.py +0 -12
- arekit/contrib/utils/processing/languages/pos.py +0 -23
- arekit/contrib/utils/processing/languages/ru/__init__.py +0 -0
- arekit/contrib/utils/processing/languages/ru/cases.py +0 -78
- arekit/contrib/utils/processing/languages/ru/constants.py +0 -6
- arekit/contrib/utils/processing/languages/ru/mods.py +0 -13
- arekit/contrib/utils/processing/languages/ru/number.py +0 -23
- arekit/contrib/utils/processing/languages/ru/pos_service.py +0 -36
- arekit/contrib/utils/processing/lemmatization/__init__.py +0 -0
- arekit/contrib/utils/processing/lemmatization/mystem.py +0 -51
- arekit/contrib/utils/processing/pos/__init__.py +0 -0
- arekit/contrib/utils/processing/pos/base.py +0 -12
- arekit/contrib/utils/processing/pos/mystem_wrap.py +0 -134
- arekit/contrib/utils/processing/pos/russian.py +0 -10
- arekit/contrib/utils/processing/text/__init__.py +0 -0
- arekit/contrib/utils/processing/text/tokens.py +0 -127
- arekit/contrib/utils/resources.py +0 -25
- arekit/contrib/utils/serializer.py +0 -43
- arekit/contrib/utils/sources/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/__init__.py +0 -0
- arekit/contrib/utils/sources/sentinerel/text_opinion/prof_per_org_filter.py +0 -63
- arekit/contrib/utils/vectorizers/__init__.py +0 -0
- arekit/contrib/utils/vectorizers/bpe.py +0 -93
- arekit/contrib/utils/vectorizers/random_norm.py +0 -39
- arekit/download_data.py +0 -11
- arekit-0.24.0.dist-info/METADATA +0 -23
- arekit-0.24.0.dist-info/RECORD +0 -374
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/LICENSE +0 -0
- {arekit-0.24.0.dist-info → arekit-0.25.1.dist-info}/top_level.txt +0 -0
|
@@ -1,29 +1,12 @@
|
|
|
1
1
|
from collections.abc import Iterable
|
|
2
2
|
import logging
|
|
3
|
-
from os.path import
|
|
4
|
-
|
|
5
|
-
from arekit.common.experiment.data_type import DataType
|
|
3
|
+
from os.path import exists
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
logger = logging.getLogger(__name__)
|
|
9
7
|
logging.basicConfig(level=logging.INFO)
|
|
10
8
|
|
|
11
9
|
|
|
12
|
-
def join_dir_with_subfolder_name(subfolder_name, dir):
|
|
13
|
-
""" Returns subfolder in in directory
|
|
14
|
-
"""
|
|
15
|
-
assert(isinstance(subfolder_name, str))
|
|
16
|
-
assert(isinstance(dir, str))
|
|
17
|
-
|
|
18
|
-
target_dir = join(dir, "{}/".format(subfolder_name))
|
|
19
|
-
return target_dir
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def filename_template(data_type):
|
|
23
|
-
assert(isinstance(data_type, DataType))
|
|
24
|
-
return "{data_type}-0".format(data_type=data_type.name.lower())
|
|
25
|
-
|
|
26
|
-
|
|
27
10
|
def check_targets_existence(targets):
|
|
28
11
|
assert (isinstance(targets, Iterable))
|
|
29
12
|
|
|
@@ -4,8 +4,8 @@ from arekit.common.pipeline.items.base import BasePipelineItem
|
|
|
4
4
|
|
|
5
5
|
class TextEntitiesParser(BasePipelineItem):
|
|
6
6
|
|
|
7
|
-
def __init__(self):
|
|
8
|
-
super(TextEntitiesParser, self).__init__()
|
|
7
|
+
def __init__(self, **kwargs):
|
|
8
|
+
super(TextEntitiesParser, self).__init__(**kwargs)
|
|
9
9
|
|
|
10
10
|
@staticmethod
|
|
11
11
|
def __process_word(word):
|
|
@@ -6,11 +6,10 @@ from arekit.common.pipeline.items.base import BasePipelineItem
|
|
|
6
6
|
|
|
7
7
|
class FrameVariantsParser(BasePipelineItem):
|
|
8
8
|
|
|
9
|
-
def __init__(self, frame_variants):
|
|
9
|
+
def __init__(self, frame_variants, **kwargs):
|
|
10
10
|
assert(isinstance(frame_variants, FrameVariantsCollection))
|
|
11
11
|
assert(len(frame_variants) > 0)
|
|
12
|
-
|
|
13
|
-
super(FrameVariantsParser, self).__init__()
|
|
12
|
+
super(FrameVariantsParser, self).__init__(**kwargs)
|
|
14
13
|
|
|
15
14
|
self.__frame_variants = frame_variants
|
|
16
15
|
self.__max_variant_len = max([len(variant) for _, variant in frame_variants.iter_variants()])
|
|
@@ -3,12 +3,10 @@ from arekit.common.linkage.text_opinions import TextOpinionsLinkage
|
|
|
3
3
|
from arekit.common.docs.parsed.base import ParsedDocument
|
|
4
4
|
from arekit.common.docs.parsed.providers.entity_service import EntityServiceProvider
|
|
5
5
|
from arekit.common.docs.parsed.service import ParsedDocumentService
|
|
6
|
-
from arekit.common.docs.parser import
|
|
7
|
-
from arekit.common.pipeline.base import BasePipeline
|
|
6
|
+
from arekit.common.docs.parser import DocumentParsers
|
|
8
7
|
from arekit.common.pipeline.items.flatten import FlattenIterPipelineItem
|
|
9
8
|
from arekit.common.pipeline.items.map import MapPipelineItem
|
|
10
9
|
from arekit.common.pipeline.items.map_nested import MapNestedPipelineItem
|
|
11
|
-
from arekit.common.text.parser import BaseTextParser
|
|
12
10
|
from arekit.common.text_opinions.base import TextOpinion
|
|
13
11
|
from arekit.contrib.utils.pipelines.text_opinion.filters.base import TextOpinionFilter
|
|
14
12
|
from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import FrameworkLimitationsTextOpinionFilter
|
|
@@ -17,7 +15,7 @@ from arekit.contrib.utils.pipelines.text_opinion.filters.limitation import Frame
|
|
|
17
15
|
def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
|
|
18
16
|
text_opinion_filters, use_meta):
|
|
19
17
|
""" use_meta: bool
|
|
20
|
-
this is mainly for
|
|
18
|
+
this is mainly for the progress-bar and other console parameters to stay up-to-date
|
|
21
19
|
with the state in the case we do not have that much output results
|
|
22
20
|
across multiple amount of documents.
|
|
23
21
|
"""
|
|
@@ -64,24 +62,24 @@ def __iter_text_opinion_linkages(parsed_doc, annotators, entity_index_func,
|
|
|
64
62
|
yield MetaEmptyLinkedDataWrapper(doc_id=parsed_doc.RelatedDocID)
|
|
65
63
|
|
|
66
64
|
|
|
67
|
-
def text_opinion_extraction_pipeline(
|
|
65
|
+
def text_opinion_extraction_pipeline(pipeline_items, get_doc_by_id_func, annotators, entity_index_func, batch_size,
|
|
68
66
|
text_opinion_filters=None, use_meta_between_docs=True):
|
|
69
|
-
assert(isinstance(text_parser, BaseTextParser))
|
|
70
67
|
assert(callable(get_doc_by_id_func))
|
|
71
68
|
assert(isinstance(annotators, list))
|
|
72
69
|
assert(isinstance(text_opinion_filters, list) or text_opinion_filters is None)
|
|
73
70
|
assert(isinstance(use_meta_between_docs, bool))
|
|
71
|
+
assert(isinstance(batch_size, int) and batch_size > 0)
|
|
74
72
|
|
|
75
73
|
extra_filters = [] if text_opinion_filters is None else text_opinion_filters
|
|
76
74
|
actual_text_opinion_filters = [FrameworkLimitationsTextOpinionFilter()] + extra_filters
|
|
77
75
|
|
|
78
|
-
return
|
|
76
|
+
return [
|
|
79
77
|
# (doc_id) -> (doc)
|
|
80
78
|
MapPipelineItem(map_func=lambda doc_id: get_doc_by_id_func(doc_id)),
|
|
81
79
|
|
|
82
80
|
# (doc, ppl_ctx) -> (parsed_doc)
|
|
83
|
-
MapNestedPipelineItem(map_func=lambda doc, ppl_ctx:
|
|
84
|
-
doc=doc,
|
|
81
|
+
MapNestedPipelineItem(map_func=lambda doc, ppl_ctx: DocumentParsers.parse_batch(
|
|
82
|
+
doc=doc, pipeline_items=pipeline_items, parent_ppl_ctx=ppl_ctx, batch_size=batch_size)),
|
|
85
83
|
|
|
86
84
|
# (parsed_doc) -> (text_opinions)
|
|
87
85
|
MapPipelineItem(map_func=lambda parsed_doc: __iter_text_opinion_linkages(
|
|
@@ -90,4 +88,4 @@ def text_opinion_extraction_pipeline(text_parser, get_doc_by_id_func, annotators
|
|
|
90
88
|
|
|
91
89
|
# linkages[] -> linkages
|
|
92
90
|
FlattenIterPipelineItem()
|
|
93
|
-
]
|
|
91
|
+
]
|
|
Binary file
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: arekit
|
|
3
|
+
Version: 0.25.1
|
|
4
|
+
Summary: Document level Attitude and Relation Extraction toolkit (AREkit) for sampling and prompting mass-media news into datasets for ML-model training
|
|
5
|
+
Home-page: https://github.com/nicolay-r/AREkit
|
|
6
|
+
Author: Nicolay Rusnachenko
|
|
7
|
+
Author-email: rusnicolay@gmail.com
|
|
8
|
+
License: MIT License
|
|
9
|
+
Keywords: natural language processing,relation extraction,sentiment analysis
|
|
10
|
+
Classifier: Programming Language :: Python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
12
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Requires-Python: >=3.6
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: tqdm
|
|
19
|
+
Requires-Dist: enum34==1.1.10
|
|
20
|
+
Requires-Dist: numpy>=1.14.5
|
|
21
|
+
|
|
22
|
+
# AREkit 0.25.1
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<img src="logo.png"/>
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
**AREkit** (Attitude and Relation Extraction Toolkit) --
|
|
31
|
+
is a python toolkit, devoted to document level Attitude and Relation Extraction between text objects from mass-media news.
|
|
32
|
+
|
|
33
|
+
## Description
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
This toolkit aims at memory-effective data processing in [Relation Extraction (RE)](https://nlpprogress.com/english/relationship_extraction.html) related tasks.
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<img src="docs/arekit-pipeline-concept.png"/>
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
> Figure: AREkit pipelines design. More on
|
|
43
|
+
> **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://link.springer.com/chapter/10.1007/978-3-031-56069-9_23)** paper
|
|
44
|
+
|
|
45
|
+
In particular, this framework provides the following features:
|
|
46
|
+
* ➿ [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for handling large-scale collections serialization without out-of-memory issues.
|
|
47
|
+
* 🔗 EL (entity-linking) API support for objects,
|
|
48
|
+
* ➰ avoidance of cyclic connections,
|
|
49
|
+
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
|
|
50
|
+
* 📑 relations annotations and filtering rules,
|
|
51
|
+
* *️⃣ entities formatting or masking, and more.
|
|
52
|
+
|
|
53
|
+
The core functionality includes:
|
|
54
|
+
* API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
|
|
55
|
+
for sentence level relations preparation (dubbed as contexts);
|
|
56
|
+
* API for contexts extraction;
|
|
57
|
+
* Relations transferring from sentence-level onto document-level, and more.
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install git+https://github.com/nicolay-r/AREkit.git@0.25.1-rc
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.
|
|
68
|
+
|
|
69
|
+
## How to cite
|
|
70
|
+
Great research is always accompanied by faithful references.
|
|
71
|
+
If you use or extend our work, please cite as follows:
|
|
72
|
+
|
|
73
|
+
```bibtex
|
|
74
|
+
@inproceedings{rusnachenko2024arelight,
|
|
75
|
+
title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
|
|
76
|
+
author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
|
|
77
|
+
booktitle={European Conference on Information Retrieval},
|
|
78
|
+
year={2024},
|
|
79
|
+
organization={Springer}
|
|
80
|
+
}
|
|
81
|
+
```
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
arekit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
arekit/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
arekit/common/bound.py,sha256=lPpHY6ct_CU9e4qXeYjhJfWbTj6Sb_NVtZ1CJheQPNE,1402
|
|
4
|
+
arekit/common/log_utils.py,sha256=OfEQxbExkuRAl9dxlgFEqcFhI4HHoMYT7WE8ud0IPOM,924
|
|
5
|
+
arekit/common/utils.py,sha256=N061ENJJgvsB338Q9cixc6RWyuikSPQq4Tc8mmgwy9s,2659
|
|
6
|
+
arekit/common/context/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
arekit/common/context/terms_mapper.py,sha256=QA02Cv7D2JKTlXkez_0w0J8HuvNziNF2vrqLgy4Bwc8,1447
|
|
8
|
+
arekit/common/context/token.py,sha256=CpWAlvprUnJfCtYvO8lwdfU_ofSKAOGOudXTwppyzSk,459
|
|
9
|
+
arekit/common/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
arekit/common/data/const.py,sha256=J74zim3CGJlLJp-AVn5z9TTuBfmttjiM_8sRW1Pc-iE,457
|
|
11
|
+
arekit/common/data/doc_provider.py,sha256=KU6Q2-B8_cUuFhSBHYp-cDI8OCwFk3fwOahv2QLIR2c,149
|
|
12
|
+
arekit/common/data/rows_fmt.py,sha256=klq9HdzSnhbRBhOw7O4ctp3PZ5L6ZVy-0eIV2vLLYY8,2694
|
|
13
|
+
arekit/common/data/rows_parser.py,sha256=qYSEETvhX_0_JuAqm0bjK_V28_53qq7OY9JAnBdRC78,1513
|
|
14
|
+
arekit/common/data/input/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
arekit/common/data/input/sample.py,sha256=6JeGxsLbEUXVKPWA1hIlkTDNOaYg4bHCJWw0ULrLByg,2143
|
|
16
|
+
arekit/common/data/input/terms_mapper.py,sha256=DUOMbGwiQETY7qhztoU8uU30d1cQPsIsgNLldpjcufg,3197
|
|
17
|
+
arekit/common/data/input/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
+
arekit/common/data/input/providers/const.py,sha256=GDvPkgP7hllHW3QiueMBQgQyu2CtNFI4JYNNja2Im6Q,187
|
|
19
|
+
arekit/common/data/input/providers/contents.py,sha256=jT1LJE_5Igw5H2e1jKsWWciHSbPVg649phT177SzhEA,261
|
|
20
|
+
arekit/common/data/input/providers/columns/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
arekit/common/data/input/providers/columns/base.py,sha256=Ar4GkC1L8YFMgeVNM-pIkOOUvKqf2CgIIdh5DA0V8uI,225
|
|
22
|
+
arekit/common/data/input/providers/columns/sample.py,sha256=3onDT6LGkFwU3GOAm6M1MvgjD3fEgapTslAV6-9gvIE,1756
|
|
23
|
+
arekit/common/data/input/providers/instances/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
arekit/common/data/input/providers/instances/base.py,sha256=ybaHQNRpuebdHGU937yzkgZ0E7mO-S7Dm41NwFj44ew,420
|
|
25
|
+
arekit/common/data/input/providers/instances/multiple.py,sha256=6agaTA3srLiLEhBTU0RnD01GUFqMcsITV5NjVkUgR10,1144
|
|
26
|
+
arekit/common/data/input/providers/instances/single.py,sha256=bZKIn_Kw79c8pH1a3aUq1dmOsDu__BoFwQDLGjEtg5I,253
|
|
27
|
+
arekit/common/data/input/providers/label/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
+
arekit/common/data/input/providers/label/base.py,sha256=1MOCKw_OP_IbYT5OR5C3b9VZdYnLGg-TxPc_qHpuZJs,620
|
|
29
|
+
arekit/common/data/input/providers/label/binary.py,sha256=jPD6Jn8DYMrdI3jN8ueoWvuGMouUKbelmI07sP9Wau4,337
|
|
30
|
+
arekit/common/data/input/providers/label/multiple.py,sha256=HWbHF_CwwbiLQbYm5dgvnXAm0b6tJOyFYFEUBxuWAqI,492
|
|
31
|
+
arekit/common/data/input/providers/rows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
+
arekit/common/data/input/providers/rows/base.py,sha256=syH7ZEW3Agwfb1IR0G7n_Amy3Kkg0EZk2V7kH3r7ADg,2517
|
|
33
|
+
arekit/common/data/input/providers/rows/samples.py,sha256=uqLTP8fnz-0wC7ALLlIDUYtXTG4OpnRqp70Fgv_1Iiw,9427
|
|
34
|
+
arekit/common/data/input/providers/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
|
+
arekit/common/data/input/providers/sample/cropped.py,sha256=jJSos4Si-qy-wb-QmomXxxgURR1UhJnvY0tZoowlfVc,1885
|
|
36
|
+
arekit/common/data/input/providers/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
+
arekit/common/data/input/providers/text/single.py,sha256=vm3sShIYZcmses-hmZX9cOfveWXCYGwvKLgQ0qs3VXQ,1604
|
|
38
|
+
arekit/common/data/storages/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
+
arekit/common/data/storages/base.py,sha256=psxo5uIc3hUDi5Cgf4j3Cm-935Fy1VQBYzcBzCcCFZE,2661
|
|
40
|
+
arekit/common/docs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
+
arekit/common/docs/base.py,sha256=uXUOtpR9BEsDBfDHg4eLqOjfSVOV_o9VPii3nSxLZuY,734
|
|
42
|
+
arekit/common/docs/entities_grouping.py,sha256=_r254fNr0j6BjHuLZBLjj21yWm4_k__5aOcBXcAaQUQ,704
|
|
43
|
+
arekit/common/docs/entity.py,sha256=TxrZMdIEgjk-PgCyskCkVis2KAw_M7vTBp3ppP6G05M,662
|
|
44
|
+
arekit/common/docs/parser.py,sha256=dzWjpbbYt-C9UU9sSy_Holnm0kQxJqtz1_6va6kS_L4,1780
|
|
45
|
+
arekit/common/docs/sentence.py,sha256=nZCCFj2yk71POoXCBfEMN3pteM2qQdj60eEzxMVY_3k,302
|
|
46
|
+
arekit/common/docs/parsed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
|
+
arekit/common/docs/parsed/base.py,sha256=WPstqOpBuLKjtz6UO_bI0DpOPF3Sm0wYEVwjtldbPXE,3175
|
|
48
|
+
arekit/common/docs/parsed/service.py,sha256=fSzwtRcSvmvlW8LyK6XPf7wJAx66GWlbRgH_3oQf-BU,1029
|
|
49
|
+
arekit/common/docs/parsed/term_position.py,sha256=H9eQQeanLxwP6og30TQUnpcXymGEPwXClRpaE8VnpLs,1040
|
|
50
|
+
arekit/common/docs/parsed/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
|
+
arekit/common/docs/parsed/providers/base.py,sha256=IjnG7c6Q78cYYAPTrwuZCOiMQDfMaujDQ6U0gK7JCcw,2587
|
|
52
|
+
arekit/common/docs/parsed/providers/base_pairs.py,sha256=RDYjspkENPQU2pn7Jp5mFrL9566eVWgXMEzWBQlMdRo,2195
|
|
53
|
+
arekit/common/docs/parsed/providers/entity_service.py,sha256=oaBfferpkDXfAFL17vpecSZUsV1Pjvq6lqgHDHsIEZY,6657
|
|
54
|
+
arekit/common/docs/parsed/providers/opinion_pairs.py,sha256=ibeFmvpMBBARtqQ3EKEocIOulgzavv0DeYxePGQK5-U,633
|
|
55
|
+
arekit/common/docs/parsed/providers/text_opinion_pairs.py,sha256=BC4uVgFxy3oZTkCq9VgOlqoqhODia2Z3anoGyGoy0ao,3139
|
|
56
|
+
arekit/common/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
57
|
+
arekit/common/entities/base.py,sha256=kpJFo4pCRVBQX6T8PibLKspp9UwoIrkHDoFMTM9KkUs,1646
|
|
58
|
+
arekit/common/entities/collection.py,sha256=ySSriMYP6zzdto1mC0V9VPXmkAqyJN3mmGoqoNValGI,1931
|
|
59
|
+
arekit/common/entities/str_fmt.py,sha256=gAPeS8RXdhh8Px_u5eOAPbtLREiiyMueid0lQoa4EbQ,250
|
|
60
|
+
arekit/common/entities/types.py,sha256=pxFB0gsevdsmnduN_Ffk7_P2TRiMt6NAHyrutuKOFvs,145
|
|
61
|
+
arekit/common/experiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
|
+
arekit/common/experiment/data_type.py,sha256=DezUkfwLTf6XLYheqPiaWyx3ZwcldsJ8wDV8aNgJtDk,227
|
|
63
|
+
arekit/common/experiment/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
64
|
+
arekit/common/experiment/api/base_samples_io.py,sha256=SN8CnbEYaazE3SldvnENfjoNRHsTejtrg4jJfqfZLMs,516
|
|
65
|
+
arekit/common/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
+
arekit/common/frames/text_variant.py,sha256=TlWR4jnuF7HW9BMHhOTKkr768V_Ub0wd0E5A4YTwD0c,875
|
|
67
|
+
arekit/common/frames/connotations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
68
|
+
arekit/common/frames/connotations/descriptor.py,sha256=yow1Wo-Hf52rx2hiQlpeSkpP4WFFcFB25ewgXtwm588,408
|
|
69
|
+
arekit/common/frames/connotations/provider.py,sha256=Zm-NFL-aVKJM_NhvTWizIAiNENt6B1tegTrj0k2afoc,114
|
|
70
|
+
arekit/common/frames/variants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
+
arekit/common/frames/variants/base.py,sha256=PhFxJZl-g9bGLfg1OlPKIUETAsTx4wwSPuBS5yOEPg8,489
|
|
72
|
+
arekit/common/frames/variants/collection.py,sha256=28_DRBny_iAWMdHpupdCnLvBp0FtF2tjz-uUctyrmhY,1935
|
|
73
|
+
arekit/common/labels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
|
+
arekit/common/labels/base.py,sha256=m7EjvPcQPHtzZ0txVqNXIQPUzgNuaU2FmDyND7K4yTE,412
|
|
75
|
+
arekit/common/labels/str_fmt.py,sha256=ecDsP1-7NNHk_aEaBlPaNaNoA_aqy28QBOHoIxtEnDk,1707
|
|
76
|
+
arekit/common/labels/provider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
|
+
arekit/common/labels/provider/base.py,sha256=KIWvRwXGWNWYhrzEV8A0g9r0Yk7N2E0qQpf9-UpVnbw,151
|
|
78
|
+
arekit/common/labels/provider/constant.py,sha256=bU6DCm1iuk_W2fMkg-NxABMJqgS9DtwxnoHpD_vSnLc,462
|
|
79
|
+
arekit/common/labels/scaler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
80
|
+
arekit/common/labels/scaler/base.py,sha256=FTZ7eTPTVK9IXLcZaXbpidsTTqTjX0-l1Qt-N1bpqWg,2349
|
|
81
|
+
arekit/common/labels/scaler/sentiment.py,sha256=TbYdM9mdtFTQL_fgh9rS9TEc-7U4Fpskp8JvnvN8TAA,180
|
|
82
|
+
arekit/common/labels/scaler/single.py,sha256=tybF3-fO4CHd_QUFnDCEmTbfbljfJA9aZEv9MtpM5Ss,308
|
|
83
|
+
arekit/common/linkage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
84
|
+
arekit/common/linkage/base.py,sha256=toZmKjTr444eHhvNLMSNU23KXtfH5DtOGtS99qGNcOo,1014
|
|
85
|
+
arekit/common/linkage/meta.py,sha256=LFHHhAkzQzym8rha4uuXb0BKwIb61SVGtxnU4iF_Nuk,692
|
|
86
|
+
arekit/common/linkage/opinions.py,sha256=8OQscnh1-5JJL3KX_lCm_6ayGCezDuFnvidfuwkjClI,255
|
|
87
|
+
arekit/common/linkage/text_opinions.py,sha256=qR1-zGEYaVPSpNISnGGXnABpdP6Qx8tc1i5DsEyn9wo,571
|
|
88
|
+
arekit/common/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
89
|
+
arekit/common/model/labeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
90
|
+
arekit/common/model/labeling/base.py,sha256=uj7_igCWEU23OjnzabNy0LyxoZ6S_qSfCA-ZaoL1erA,727
|
|
91
|
+
arekit/common/model/labeling/modes.py,sha256=DiwC6Aomke-ojwwpR2pcd4qgQSwmRdGCvQlyHHhN3YY,127
|
|
92
|
+
arekit/common/model/labeling/single.py,sha256=Eggi0obocjiT9ofv_U0zLiFoEIeUQhaMCqjCWn14Fh8,773
|
|
93
|
+
arekit/common/opinions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
94
|
+
arekit/common/opinions/base.py,sha256=eIx1RzsngCkpnF2Utju5i_Qp7gqF_rDIe_UDeMGXtmo,2112
|
|
95
|
+
arekit/common/opinions/collection.py,sha256=bdx-CIYYdE-DrjyB1mRTGtkLb-lrGPTSLl25xv5EHnM,4938
|
|
96
|
+
arekit/common/opinions/enums.py,sha256=TE5AGN_xb0NdZ636UtHuYFRMNl24iwXzmyf8WUfvr6w,83
|
|
97
|
+
arekit/common/opinions/provider.py,sha256=q4hXRFDuGoo9fGOf_L9CM048YBtel1v3__ZqfSXL8Xc,168
|
|
98
|
+
arekit/common/opinions/writer.py,sha256=-IbWTIVlX2rhLpSP_8iuQ3_WyzzGwhto7ujfnNL6jhA,173
|
|
99
|
+
arekit/common/opinions/annot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
100
|
+
arekit/common/opinions/annot/algo_based.py,sha256=cvDGDmUoUaQ1Xcbyouxrjs0CkHRfRogW8Mfs5O5cOlc,2240
|
|
101
|
+
arekit/common/opinions/annot/base.py,sha256=IvwrwT8O3s6b2_R0arpMR4Uog7kuWQZUAyRP5cq_27A,382
|
|
102
|
+
arekit/common/opinions/annot/algo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
103
|
+
arekit/common/opinions/annot/algo/base.py,sha256=ymll-4-SplCY7CLswjOZEC1vsVHIEzUP0JMYgvL8hbo,124
|
|
104
|
+
arekit/common/opinions/annot/algo/pair_based.py,sha256=HbYn1mAsn5g11NiC9pfrMqNtJn_GzvqPFGpafMqqB2o,4419
|
|
105
|
+
arekit/common/opinions/annot/algo/predefined.py,sha256=zU39SADPKnykHCNB-Bmn_0bvd6gYWWYmfgfi-68hHSs,741
|
|
106
|
+
arekit/common/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
|
+
arekit/common/pipeline/base.py,sha256=8TgWNy5QrnKEp1bq3lhyGSgIfYe5ZIZU3c-DYBJ9LPA,957
|
|
108
|
+
arekit/common/pipeline/batching.py,sha256=DdOvOladOo2aEv3JZ8NQnCvsNGcWk4TFzENrZqTGyXk,1239
|
|
109
|
+
arekit/common/pipeline/context.py,sha256=Fw25lBVakHNAXjtkdEqopR-Jh59cDKGWD2jCJxBrj7Y,1126
|
|
110
|
+
arekit/common/pipeline/conts.py,sha256=NAQNsHt1kK3HnxWv3M6yXi0c7C6Mx6ZZ6KZc0yE0eas,70
|
|
111
|
+
arekit/common/pipeline/utils.py,sha256=5VqH1LtRa4tYUbyiRvWdBmP4biFhTKq9vhr8QiRFFkY,882
|
|
112
|
+
arekit/common/pipeline/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
113
|
+
arekit/common/pipeline/items/base.py,sha256=15-z8ERQ0QxaRszs7sHQduU0KIBJIm8B0V2nwCva6d0,1695
|
|
114
|
+
arekit/common/pipeline/items/flatten.py,sha256=9T4jWqPGv4UDxajlM0Nm0-gvwUgqqYB8XH0efTum9a0,542
|
|
115
|
+
arekit/common/pipeline/items/handle.py,sha256=QS5Byj7-o5jmFi0ag58NE3zm2-JzVIunIgc3Pn1ij6g,578
|
|
116
|
+
arekit/common/pipeline/items/iter.py,sha256=Tk9WdUMPOq20s7jEWEpU4PmillnVtQ8nIa2ct7iw-3s,406
|
|
117
|
+
arekit/common/pipeline/items/map.py,sha256=G5wBdjaaxePD0pijrxsfpJACeP7kzj7HerjCkNIhmII,381
|
|
118
|
+
arekit/common/pipeline/items/map_nested.py,sha256=vs0GdJNr3qSF9p2yd1nWji5E1HGzECbvOfN2MqoHc2A,630
|
|
119
|
+
arekit/common/synonyms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
120
|
+
arekit/common/synonyms/base.py,sha256=YxD-CKCjlEtar1zTdumnfC3vKgbP2wLODR9mMEwbbnA,4237
|
|
121
|
+
arekit/common/synonyms/grouping.py,sha256=fi7QQbBvsTvvP2CPTesSPEsPNmGfc6euqj-HPhVvtlg,698
|
|
122
|
+
arekit/common/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
123
|
+
arekit/common/text/enums.py,sha256=nelEI7B-szLUtl8xds8Kw_vgK5JWg_Aj7IadEj2q_1Y,141
|
|
124
|
+
arekit/common/text/parsed.py,sha256=YxGRHtozDd3sDVI3hMT_hOO7Wmsy7_zLkblfnSXeJ9g,1104
|
|
125
|
+
arekit/common/text/partitioning.py,sha256=OL8r3-xaMafnT7FuPXDHINlA-BQgx6cLaMqm366WKCU,1153
|
|
126
|
+
arekit/common/text/stemmer.py,sha256=OJ5XelxLN-7m3uLPDU9C7CWdkXDeK-xieexQN6RYLXc,341
|
|
127
|
+
arekit/common/text_opinions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
|
+
arekit/common/text_opinions/base.py,sha256=KootNvGAbUVCV5uFgLjK-bm9bbQSIvZUz0q9CBToGa8,3447
|
|
129
|
+
arekit/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
130
|
+
arekit/contrib/bert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
|
+
arekit/contrib/bert/input/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
|
+
arekit/contrib/bert/input/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
133
|
+
arekit/contrib/bert/input/providers/cropped_sample.py,sha256=46uHHhAe8cGxV2JlfO3thog5XV6T2niUIflFghfUSBM,866
|
|
134
|
+
arekit/contrib/bert/input/providers/text_pair.py,sha256=_1d-he0n42y3ksj8RjJlNHgHnaQUEq0aQhUdTPRMKgg,2817
|
|
135
|
+
arekit/contrib/bert/terms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
136
|
+
arekit/contrib/bert/terms/mapper.py,sha256=oHX-lsaZYjBFLjngzSKT5z_JPJCHbclUsEe4i4fup_8,992
|
|
137
|
+
arekit/contrib/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
138
|
+
arekit/contrib/prompt/sample.py,sha256=MxpbDR0ww7WmdtuPu74B8R6QKVXeuzO0CKGOJIYwbRk,3164
|
|
139
|
+
arekit/contrib/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
140
|
+
arekit/contrib/utils/bert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
141
|
+
arekit/contrib/utils/bert/samplers.py,sha256=ZVe3rbUAH0Jw1xR_yHE1DoUJf3CI0pDgbBQQzlLWevc,989
|
|
142
|
+
arekit/contrib/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
143
|
+
arekit/contrib/utils/data/contents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
144
|
+
arekit/contrib/utils/data/contents/opinions.py,sha256=MSV7NytEe15adKhhHCq5KiCj6ZBq31nV-u2rcSfFCgE,1738
|
|
145
|
+
arekit/contrib/utils/data/doc_provider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
146
|
+
arekit/contrib/utils/data/doc_provider/dict_based.py,sha256=zUOiiIbj5zby4xqMb0m9N-a6enavJJ7wFmPaGErykWU,371
|
|
147
|
+
arekit/contrib/utils/data/doc_provider/dir_based.py,sha256=FTw3kLV_CYtPoUoHl39IrP6RjLvTecCno9May95jVXw,1916
|
|
148
|
+
arekit/contrib/utils/data/storages/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
149
|
+
arekit/contrib/utils/data/storages/jsonl_based.py,sha256=dz8uizu9t1C215o0HEL8y4LiDKR4aC_-OwDu_xF0xIM,522
|
|
150
|
+
arekit/contrib/utils/data/storages/pandas_based.py,sha256=gMkWUFHZE9Oe1Uy04vEBcUfTIAdh46r5zpjlPAwwG2g,3842
|
|
151
|
+
arekit/contrib/utils/data/storages/row_cache.py,sha256=MRK0uJFvw6O99k2aFb3JLZhLUBo2JUO-WYQ4EeRRu6M,2051
|
|
152
|
+
arekit/contrib/utils/data/storages/sqlite_based.py,sha256=cIYAHyiB4CMftKgrgLqw-L4F1WnhbspjwWLSPqH5NHk,682
|
|
153
|
+
arekit/contrib/utils/data/writers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
154
|
+
arekit/contrib/utils/data/writers/base.py,sha256=JLwf5WVl_U319sdMev8YOn4OoCcrgNIUZtrOuG1JLjI,766
|
|
155
|
+
arekit/contrib/utils/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
156
|
+
arekit/contrib/utils/entities/filter.py,sha256=aHTExIMFaMdy4QL8iYE23eiby3qLImAakXR6gNqG6fs,145
|
|
157
|
+
arekit/contrib/utils/entities/formatters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
158
|
+
arekit/contrib/utils/entities/formatters/str_display.py,sha256=N8igv7EVaTFayvLXkyBGtm67KwHaeP-M-L8d7oqBG9Q,401
|
|
159
|
+
arekit/contrib/utils/entities/formatters/str_simple_sharp_prefixed_fmt.py,sha256=rEUIma9O3kOBWIguGtJ69JH-00Dhm0vUBOd5yNcKweY,653
|
|
160
|
+
arekit/contrib/utils/io_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
161
|
+
arekit/contrib/utils/io_utils/utils.py,sha256=310SIJTsNLn2OZrGPer9W4ZP52PHkjBK3zsyqxVs3h0,537
|
|
162
|
+
arekit/contrib/utils/pipelines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
163
|
+
arekit/contrib/utils/pipelines/opinion_collections.py,sha256=y9-klVJGCN9mPd7t1ECllAiCnAb3MKVXC1PnYddp5sQ,3195
|
|
164
|
+
arekit/contrib/utils/pipelines/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
165
|
+
arekit/contrib/utils/pipelines/items/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
166
|
+
arekit/contrib/utils/pipelines/items/text/entities_default.py,sha256=vNx5ir2mf7a1gg_OeqUsf_p1Fu2k7QIFxVpe-CuwZ84,727
|
|
167
|
+
arekit/contrib/utils/pipelines/items/text/frames.py,sha256=pZQybYfgEQB1DM3PtmsgrtB2Xl0HejmP4rhT0nR_YKE,2586
|
|
168
|
+
arekit/contrib/utils/pipelines/text_opinion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
169
|
+
arekit/contrib/utils/pipelines/text_opinion/extraction.py,sha256=MT1WMlvVI25JRL0g7W83bV8BGUr7_MNOQBj7ZAHgrnU,4245
|
|
170
|
+
arekit/contrib/utils/pipelines/text_opinion/annot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
|
+
arekit/contrib/utils/pipelines/text_opinion/annot/algo_based.py,sha256=bwS-UR2x3rgp_xqnf6z-73T-eIZE_kltRSGYxgd_WpU,1751
|
|
172
|
+
arekit/contrib/utils/pipelines/text_opinion/filters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
173
|
+
arekit/contrib/utils/pipelines/text_opinion/filters/base.py,sha256=GnKnJB4MKqiMSJny3a9Na7l7Csm7abbt6GADBCY18Mw,143
|
|
174
|
+
arekit/contrib/utils/pipelines/text_opinion/filters/distance_based.py,sha256=3Pjq4IJJMT7dYpK266lN66WQJUnQO3P0rG6wcAvJOOA,649
|
|
175
|
+
arekit/contrib/utils/pipelines/text_opinion/filters/entity_based.py,sha256=pdWFJaKh4kKIsUuBNp3WNy5Rj80CjWEy2wp-0axFnrI,1254
|
|
176
|
+
arekit/contrib/utils/pipelines/text_opinion/filters/limitation.py,sha256=4AFS5zhocJuYphGO2ZMWmYTtIhGItKDTkB0--AmjgnA,1151
|
|
177
|
+
arekit/contrib/utils/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
178
|
+
arekit/contrib/utils/synonyms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
|
+
arekit/contrib/utils/synonyms/simple.py,sha256=ST9EwuWP88FzbyV8Gi0-biTPgGOsZ7OWyaBWHL_U_eo,557
|
|
180
|
+
arekit/contrib/utils/synonyms/stemmer_based.py,sha256=q19P_XOCWN2_JrBtybAt7ToMIr1ambw4ahr0fSEEHmQ,1400
|
|
181
|
+
arekit-0.25.1.data/data/logo.png,sha256=S8OZ4MGGD72Pf5co7ngYbXKkJH1EUhbErUXv1ZjUWiU,45718
|
|
182
|
+
arekit-0.25.1.dist-info/LICENSE,sha256=JO9tIbxAvhwDv73cX-gUStr9yA-TY7wusUeLHRx7JuY,1076
|
|
183
|
+
arekit-0.25.1.dist-info/METADATA,sha256=ryWGTL4fYqR36z2qh1UuYBg6UIU6n7_U9Y09KPRS6xk,3177
|
|
184
|
+
arekit-0.25.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
185
|
+
arekit-0.25.1.dist-info/top_level.txt,sha256=4pXuFE8IE0lBsqi6ZsR7figx0H939VIX4_-76YIbkOQ,7
|
|
186
|
+
arekit-0.25.1.dist-info/RECORD,,
|
|
File without changes
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
from arekit.common.data.input.providers.columns.base import BaseColumnsProvider
|
|
2
|
-
from arekit.common.data.input.providers.contents import ContentsProvider
|
|
3
|
-
from arekit.common.data.input.providers.rows.base import BaseRowProvider
|
|
4
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
5
|
-
from arekit.contrib.utils.data.storages.row_cache import RowCacheStorage
|
|
6
|
-
from arekit.contrib.utils.data.writers.base import BaseWriter
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BaseInputRepository(object):
|
|
10
|
-
|
|
11
|
-
def __init__(self, columns_provider, rows_provider, storage):
|
|
12
|
-
assert(isinstance(columns_provider, BaseColumnsProvider))
|
|
13
|
-
assert(isinstance(rows_provider, BaseRowProvider))
|
|
14
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
15
|
-
|
|
16
|
-
self._columns_provider = columns_provider
|
|
17
|
-
self._rows_provider = rows_provider
|
|
18
|
-
self._storage = storage
|
|
19
|
-
|
|
20
|
-
# Do setup operations.
|
|
21
|
-
self._setup_columns_provider()
|
|
22
|
-
self._setup_rows_provider()
|
|
23
|
-
|
|
24
|
-
# region protected methods
|
|
25
|
-
|
|
26
|
-
def _setup_columns_provider(self):
|
|
27
|
-
pass
|
|
28
|
-
|
|
29
|
-
def _setup_rows_provider(self):
|
|
30
|
-
pass
|
|
31
|
-
|
|
32
|
-
# endregion
|
|
33
|
-
|
|
34
|
-
def populate(self, contents_provider, doc_ids, desc="", writer=None, target=None):
|
|
35
|
-
assert(isinstance(contents_provider, ContentsProvider))
|
|
36
|
-
assert(isinstance(self._storage, BaseRowsStorage))
|
|
37
|
-
assert(isinstance(doc_ids, list))
|
|
38
|
-
assert(isinstance(writer, BaseWriter) or writer is None)
|
|
39
|
-
assert(isinstance(target, str) or target is None)
|
|
40
|
-
|
|
41
|
-
def iter_rows(idle_mode):
|
|
42
|
-
return self._rows_provider.iter_by_rows(
|
|
43
|
-
contents_provider=contents_provider,
|
|
44
|
-
doc_ids_iter=doc_ids,
|
|
45
|
-
idle_mode=idle_mode)
|
|
46
|
-
|
|
47
|
-
self._storage.init_empty(columns_provider=self._columns_provider)
|
|
48
|
-
|
|
49
|
-
is_async_write_mode_on = writer is not None and target is not None
|
|
50
|
-
|
|
51
|
-
if is_async_write_mode_on:
|
|
52
|
-
writer.open_target(target)
|
|
53
|
-
|
|
54
|
-
self._storage.fill(lambda idle_mode: iter_rows(idle_mode),
|
|
55
|
-
columns_provider=self._columns_provider,
|
|
56
|
-
row_handler=lambda: writer.commit_line(self._storage) if is_async_write_mode_on else None,
|
|
57
|
-
desc=desc)
|
|
58
|
-
|
|
59
|
-
if is_async_write_mode_on:
|
|
60
|
-
writer.close_target()
|
|
61
|
-
|
|
62
|
-
def push(self, writer, target, free_storage=True):
|
|
63
|
-
if not isinstance(self._storage, RowCacheStorage):
|
|
64
|
-
writer.write_all(self._storage, target)
|
|
65
|
-
|
|
66
|
-
# After writing we free the contents of the storage.
|
|
67
|
-
if free_storage:
|
|
68
|
-
self._storage.free()
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from arekit.common.data.input.providers.rows.samples import BaseSampleRowProvider
|
|
4
|
-
from arekit.common.data.input.repositories.base import BaseInputRepository
|
|
5
|
-
|
|
6
|
-
logger = logging.getLogger(__name__)
|
|
7
|
-
logging.basicConfig(level=logging.INFO)
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class BaseInputSamplesRepository(BaseInputRepository):
|
|
11
|
-
|
|
12
|
-
def _setup_rows_provider(self):
|
|
13
|
-
""" Setup store labels.
|
|
14
|
-
"""
|
|
15
|
-
assert(isinstance(self._rows_provider, BaseSampleRowProvider))
|
|
16
|
-
self._rows_provider.set_store_labels(self._columns_provider.StoreLabels)
|
|
17
|
-
|
|
18
|
-
def _setup_columns_provider(self):
|
|
19
|
-
""" Setup text column names.
|
|
20
|
-
"""
|
|
21
|
-
text_column_names = list(self._rows_provider.TextProvider.iter_columns())
|
|
22
|
-
self._columns_provider.set_text_column_names(text_column_names)
|
|
File without changes
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
from arekit.common.data import const
|
|
2
|
-
from arekit.common.data.storages.base import BaseRowsStorage
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
# TODO. This is a particular type of view, and expected to be off the core.
|
|
6
|
-
class LinkedSamplesStorageView(object):
|
|
7
|
-
|
|
8
|
-
def iter_from_storage(self, storage):
|
|
9
|
-
assert(isinstance(storage, BaseRowsStorage))
|
|
10
|
-
undefined = -1
|
|
11
|
-
|
|
12
|
-
linked = []
|
|
13
|
-
current_opinion_id = undefined
|
|
14
|
-
for row_index, opinion_id in enumerate(storage.iter_column_values(const.OPINION_ID)):
|
|
15
|
-
if current_opinion_id != undefined:
|
|
16
|
-
if opinion_id != current_opinion_id:
|
|
17
|
-
yield linked
|
|
18
|
-
linked = []
|
|
19
|
-
current_opinion_id = opinion_id
|
|
20
|
-
else:
|
|
21
|
-
current_opinion_id = opinion_id
|
|
22
|
-
|
|
23
|
-
linked.append(storage.get_row(row_index))
|
|
24
|
-
|
|
25
|
-
if len(linked) > 0:
|
|
26
|
-
yield linked
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
from arekit.common.pipeline.items.base import BasePipelineItem
|
|
2
|
-
from arekit.common.text.partitioning.base import BasePartitioning
|
|
3
|
-
from arekit.common.pipeline.context import PipelineContext
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class SentenceObjectsParserPipelineItem(BasePipelineItem):
|
|
7
|
-
|
|
8
|
-
def __init__(self, partitioning):
|
|
9
|
-
assert(isinstance(partitioning, BasePartitioning))
|
|
10
|
-
self.__partitioning = partitioning
|
|
11
|
-
|
|
12
|
-
# region protected
|
|
13
|
-
|
|
14
|
-
def _get_text(self, pipeline_ctx):
|
|
15
|
-
return None
|
|
16
|
-
|
|
17
|
-
def _get_parts_provider_func(self, input_data, pipeline_ctx):
|
|
18
|
-
raise NotImplementedError()
|
|
19
|
-
|
|
20
|
-
# endregion
|
|
21
|
-
|
|
22
|
-
def apply_core(self, input_data, pipeline_ctx):
|
|
23
|
-
assert(isinstance(pipeline_ctx, PipelineContext))
|
|
24
|
-
external_input = self._get_text(pipeline_ctx)
|
|
25
|
-
actual_input = input_data if external_input is None else external_input
|
|
26
|
-
parts_it = self._get_parts_provider_func(input_data=actual_input, pipeline_ctx=pipeline_ctx)
|
|
27
|
-
return self.__partitioning.provide(text=actual_input, parts_it=parts_it)
|
|
28
|
-
|
|
29
|
-
# region base
|
|
30
|
-
|
|
31
|
-
def __enter__(self):
|
|
32
|
-
return self
|
|
33
|
-
|
|
34
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
35
|
-
pass
|
|
36
|
-
|
|
37
|
-
# endregion
|
arekit/common/text/parser.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
from arekit.common.pipeline.base import BasePipeline
|
|
2
|
-
from arekit.common.text.parsed import BaseParsedText
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class BaseTextParser(BasePipeline):
|
|
6
|
-
|
|
7
|
-
def run(self, input_data, params_dict=None, parent_ctx=None):
|
|
8
|
-
output_data = super(BaseTextParser, self).run(input_data=input_data,
|
|
9
|
-
params_dict=params_dict,
|
|
10
|
-
parent_ctx=parent_ctx)
|
|
11
|
-
|
|
12
|
-
return BaseParsedText(terms=output_data)
|
|
File without changes
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
from collections.abc import Iterable
|
|
2
|
-
|
|
3
|
-
from arekit.common.bound import Bound
|
|
4
|
-
from arekit.common.text.partitioning.base import BasePartitioning
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class TermsPartitioning(BasePartitioning):
|
|
8
|
-
""" NOTE: considering that provided parts
|
|
9
|
-
has no intersections between each other
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
def provide(self, text, parts_it):
|
|
13
|
-
assert(isinstance(text, list))
|
|
14
|
-
assert(isinstance(parts_it, Iterable))
|
|
15
|
-
|
|
16
|
-
start = 0
|
|
17
|
-
parts = []
|
|
18
|
-
for value, bound in parts_it:
|
|
19
|
-
assert(isinstance(bound, Bound))
|
|
20
|
-
assert(bound.Position >= start)
|
|
21
|
-
|
|
22
|
-
# Release everythig till the current value position.
|
|
23
|
-
part = text[start:bound.Position]
|
|
24
|
-
|
|
25
|
-
parts.extend(part)
|
|
26
|
-
|
|
27
|
-
# Release the entity value.
|
|
28
|
-
parts.extend([value])
|
|
29
|
-
|
|
30
|
-
start = bound.Position + bound.Length
|
|
31
|
-
|
|
32
|
-
# Release everything after the last entity.
|
|
33
|
-
parts.extend(text[start:len(text)])
|
|
34
|
-
|
|
35
|
-
return parts
|
|
File without changes
|