pyNlple 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pynlple-0.0.0/MANIFEST.in +2 -0
- pynlple-0.0.0/PKG-INFO +47 -0
- pynlple-0.0.0/README.md +33 -0
- pynlple-0.0.0/pyNlple.egg-info/PKG-INFO +47 -0
- pynlple-0.0.0/pyNlple.egg-info/SOURCES.txt +48 -0
- pynlple-0.0.0/pyNlple.egg-info/dependency_links.txt +1 -0
- pynlple-0.0.0/pyNlple.egg-info/requires.txt +5 -0
- pynlple-0.0.0/pyNlple.egg-info/top_level.txt +1 -0
- pynlple-0.0.0/pynlple/__init__.py +1 -0
- pynlple-0.0.0/pynlple/data/__init__.py +1 -0
- pynlple-0.0.0/pynlple/data/corpus.py +184 -0
- pynlple-0.0.0/pynlple/data/datasource.py +179 -0
- pynlple-0.0.0/pynlple/data/filesource.py +33 -0
- pynlple-0.0.0/pynlple/data/jsonsource.py +179 -0
- pynlple-0.0.0/pynlple/data/source.py +226 -0
- pynlple-0.0.0/pynlple/exceptions.py +16 -0
- pynlple-0.0.0/pynlple/lime.py +47 -0
- pynlple-0.0.0/pynlple/ml/__init__.py +0 -0
- pynlple-0.0.0/pynlple/ml/classifiers.py +66 -0
- pynlple-0.0.0/pynlple/ml/supervised/__init__.py +0 -0
- pynlple-0.0.0/pynlple/ml/supervised/feature_exploration.py +85 -0
- pynlple-0.0.0/pynlple/ml/supervised/training.py +73 -0
- pynlple-0.0.0/pynlple/ml/transformers.py +680 -0
- pynlple-0.0.0/pynlple/ml/vectorizers.py +408 -0
- pynlple-0.0.0/pynlple/module.py +86 -0
- pynlple-0.0.0/pynlple/processing/__init__.py +1 -0
- pynlple-0.0.0/pynlple/processing/data/currency.txt +38 -0
- pynlple-0.0.0/pynlple/processing/data/emoji/emoji_components.txt +1295 -0
- pynlple-0.0.0/pynlple/processing/data/emoji/emojis.txt +2790 -0
- pynlple-0.0.0/pynlple/processing/data/emoji/emojis_clustered.txt +2790 -0
- pynlple-0.0.0/pynlple/processing/data/punctuation.txt +23 -0
- pynlple-0.0.0/pynlple/processing/data/rus/pos/conjunctions.txt +6 -0
- pynlple-0.0.0/pynlple/processing/data/rus/pos/particles.txt +8 -0
- pynlple-0.0.0/pynlple/processing/data/rus/pos/prepositions.txt +33 -0
- pynlple-0.0.0/pynlple/processing/data/rus/pos/pronouns.txt +70 -0
- pynlple-0.0.0/pynlple/processing/data/special_symbols.txt +29 -0
- pynlple-0.0.0/pynlple/processing/data/special_tags.txt +7 -0
- pynlple-0.0.0/pynlple/processing/data/ukr/stopwords.txt +166 -0
- pynlple-0.0.0/pynlple/processing/dictionary.py +318 -0
- pynlple-0.0.0/pynlple/processing/emojis.py +80 -0
- pynlple-0.0.0/pynlple/processing/mention.py +104 -0
- pynlple-0.0.0/pynlple/processing/preprocessor.py +933 -0
- pynlple-0.0.0/pynlple/processing/stopwords.py +139 -0
- pynlple-0.0.0/pynlple/processing/text.py +140 -0
- pynlple-0.0.0/pynlple/processing/token.py +176 -0
- pynlple-0.0.0/pynlple/utils.py +168 -0
- pynlple-0.0.0/pyproject.toml +25 -0
- pynlple-0.0.0/requirements.txt +6 -0
- pynlple-0.0.0/setup.cfg +4 -0
- pynlple-0.0.0/tests/test_utils.py +367 -0
pynlple-0.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pyNlple
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: NLP procedures in python brought to you by YouScan.
|
|
5
|
+
Author-email: "YouScan Data Science team, NLP guild." <ds@youscan.io>
|
|
6
|
+
License: MIT Licence
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: emoji>=0.5.0
|
|
10
|
+
Requires-Dist: numpy>=1.18
|
|
11
|
+
Requires-Dist: pandas>=0.19.0
|
|
12
|
+
Requires-Dist: requests>=2.32.3
|
|
13
|
+
Requires-Dist: scikit-learn>=0.22
|
|
14
|
+
|
|
15
|
+
Overview
|
|
16
|
+
========
|
|
17
|
+
pyNlple - a library with basic NLP utils. Also includes methods and APIs for data accessing/writing.
|
|
18
|
+
|
|
19
|
+
Requirements
|
|
20
|
+
============
|
|
21
|
+
|
|
22
|
+
Cat `requirements.txt`
|
|
23
|
+
|
|
24
|
+
Installation
|
|
25
|
+
============
|
|
26
|
+
|
|
27
|
+
Run setup.py to install / `pip install pynlple`
|
|
28
|
+
|
|
29
|
+
Notes
|
|
30
|
+
============
|
|
31
|
+
- name inspired by https://www.youtube.com/watch?v=1W3sslyiUfg
|
|
32
|
+
- yeah, we had "PYthon" and "NLP" procedures, so...
|
|
33
|
+
- and yeah, try to read it as #pineapple http://www.veepy.com/wp-content/uploads/2015/05/1965.jpg
|
|
34
|
+
|
|
35
|
+
## Licenses
|
|
36
|
+
|
|
37
|
+
Install from https://pypi.org/project/pip-licenses/
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
pip install pip-licenses
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Run
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
make licenses
|
|
47
|
+
```
|
pynlple-0.0.0/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Overview
|
|
2
|
+
========
|
|
3
|
+
pyNlple - a library with basic NLP utils. Also includes methods and APIs for data accessing/writing.
|
|
4
|
+
|
|
5
|
+
Requirements
|
|
6
|
+
============
|
|
7
|
+
|
|
8
|
+
Cat `requirements.txt`
|
|
9
|
+
|
|
10
|
+
Installation
|
|
11
|
+
============
|
|
12
|
+
|
|
13
|
+
Run setup.py to install / `pip install pynlple`
|
|
14
|
+
|
|
15
|
+
Notes
|
|
16
|
+
============
|
|
17
|
+
- name inspired by https://www.youtube.com/watch?v=1W3sslyiUfg
|
|
18
|
+
- yeah, we had "PYthon" and "NLP" procedures, so...
|
|
19
|
+
- and yeah, try to read it as #pineapple http://www.veepy.com/wp-content/uploads/2015/05/1965.jpg
|
|
20
|
+
|
|
21
|
+
## Licenses
|
|
22
|
+
|
|
23
|
+
Install from https://pypi.org/project/pip-licenses/
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
pip install pip-licenses
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Run
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
make licenses
|
|
33
|
+
```
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pyNlple
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: NLP procedures in python brought to you by YouScan.
|
|
5
|
+
Author-email: "YouScan Data Science team, NLP guild." <ds@youscan.io>
|
|
6
|
+
License: MIT Licence
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: emoji>=0.5.0
|
|
10
|
+
Requires-Dist: numpy>=1.18
|
|
11
|
+
Requires-Dist: pandas>=0.19.0
|
|
12
|
+
Requires-Dist: requests>=2.32.3
|
|
13
|
+
Requires-Dist: scikit-learn>=0.22
|
|
14
|
+
|
|
15
|
+
Overview
|
|
16
|
+
========
|
|
17
|
+
pyNlple - a library with basic NLP utils. Also includes methods and APIs for data accessing/writing.
|
|
18
|
+
|
|
19
|
+
Requirements
|
|
20
|
+
============
|
|
21
|
+
|
|
22
|
+
Cat `requirements.txt`
|
|
23
|
+
|
|
24
|
+
Installation
|
|
25
|
+
============
|
|
26
|
+
|
|
27
|
+
Run setup.py to install / `pip install pynlple`
|
|
28
|
+
|
|
29
|
+
Notes
|
|
30
|
+
============
|
|
31
|
+
- name inspired by https://www.youtube.com/watch?v=1W3sslyiUfg
|
|
32
|
+
- yeah, we had "PYthon" and "NLP" procedures, so...
|
|
33
|
+
- and yeah, try to read it as #pineapple http://www.veepy.com/wp-content/uploads/2015/05/1965.jpg
|
|
34
|
+
|
|
35
|
+
## Licenses
|
|
36
|
+
|
|
37
|
+
Install from https://pypi.org/project/pip-licenses/
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
pip install pip-licenses
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Run
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
make licenses
|
|
47
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
requirements.txt
|
|
5
|
+
pyNlple.egg-info/PKG-INFO
|
|
6
|
+
pyNlple.egg-info/SOURCES.txt
|
|
7
|
+
pyNlple.egg-info/dependency_links.txt
|
|
8
|
+
pyNlple.egg-info/requires.txt
|
|
9
|
+
pyNlple.egg-info/top_level.txt
|
|
10
|
+
pynlple/__init__.py
|
|
11
|
+
pynlple/exceptions.py
|
|
12
|
+
pynlple/lime.py
|
|
13
|
+
pynlple/module.py
|
|
14
|
+
pynlple/utils.py
|
|
15
|
+
pynlple/data/__init__.py
|
|
16
|
+
pynlple/data/corpus.py
|
|
17
|
+
pynlple/data/datasource.py
|
|
18
|
+
pynlple/data/filesource.py
|
|
19
|
+
pynlple/data/jsonsource.py
|
|
20
|
+
pynlple/data/source.py
|
|
21
|
+
pynlple/ml/__init__.py
|
|
22
|
+
pynlple/ml/classifiers.py
|
|
23
|
+
pynlple/ml/transformers.py
|
|
24
|
+
pynlple/ml/vectorizers.py
|
|
25
|
+
pynlple/ml/supervised/__init__.py
|
|
26
|
+
pynlple/ml/supervised/feature_exploration.py
|
|
27
|
+
pynlple/ml/supervised/training.py
|
|
28
|
+
pynlple/processing/__init__.py
|
|
29
|
+
pynlple/processing/dictionary.py
|
|
30
|
+
pynlple/processing/emojis.py
|
|
31
|
+
pynlple/processing/mention.py
|
|
32
|
+
pynlple/processing/preprocessor.py
|
|
33
|
+
pynlple/processing/stopwords.py
|
|
34
|
+
pynlple/processing/text.py
|
|
35
|
+
pynlple/processing/token.py
|
|
36
|
+
pynlple/processing/data/currency.txt
|
|
37
|
+
pynlple/processing/data/punctuation.txt
|
|
38
|
+
pynlple/processing/data/special_symbols.txt
|
|
39
|
+
pynlple/processing/data/special_tags.txt
|
|
40
|
+
pynlple/processing/data/emoji/emoji_components.txt
|
|
41
|
+
pynlple/processing/data/emoji/emojis.txt
|
|
42
|
+
pynlple/processing/data/emoji/emojis_clustered.txt
|
|
43
|
+
pynlple/processing/data/rus/pos/conjunctions.txt
|
|
44
|
+
pynlple/processing/data/rus/pos/particles.txt
|
|
45
|
+
pynlple/processing/data/rus/pos/prepositions.txt
|
|
46
|
+
pynlple/processing/data/rus/pos/pronouns.txt
|
|
47
|
+
pynlple/processing/data/ukr/stopwords.txt
|
|
48
|
+
tests/test_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pynlple
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.10.1'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import bz2
|
|
3
|
+
import gzip
|
|
4
|
+
import io
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from pandas import Series
|
|
8
|
+
|
|
9
|
+
from pynlple.data.source import Source
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class StackingSource(Source):
    """Concatenates several sources, yielding every item of each in order."""

    logger = logging.getLogger(__name__)

    def __init__(self, list_sources, log=False):
        self.sources = list_sources
        self.log = log  # log every `log`-th source transition; falsy disables logging

    def __iter__(self):
        total = len(self.sources)
        for index, inner_source in enumerate(self.sources):
            if self.log and index % self.log == 0:
                self.logger.info('[%s] Corpus iterator started yielding elements from source (%d/%d): %s.',
                                 str(self.__class__.__name__), index, total, repr(inner_source))
            yield from inner_source
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DeduplicatingFixedCacheSizeSource(Source):
    """Skips entries whose extracted feature is already held in a bounded ordered cache.

    Cache values count how many duplicate hits occurred since the feature was
    inserted; with ``refresh=True`` a repeated feature is moved to the fresh
    end of the cache instead of ageing out in insertion order.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, source, ordered_dict_cache=None, cache_size=10000, refresh=False, feature_extractor=lambda x: x,
                 log=100000):
        self.source = source
        self.ordered_dict = ordered_dict_cache  # pre-seeded cache, or None for a fresh OrderedDict
        self.cache_size = cache_size            # max cached features; None means unbounded
        self.refresh = refresh
        self.feature_extractor = feature_extractor
        self.log = log                          # emit cache lifetime stats every `log` entries; falsy disables

    @staticmethod
    def __prop_dict(dict_):
        # Age every cached feature by one step (values only; keys untouched).
        for cached_key in dict_:
            dict_[cached_key] += 1

    def __iter__(self):
        if self.ordered_dict is None:
            from collections import OrderedDict
            self.__cache = OrderedDict()
        else:
            self.__cache = self.ordered_dict
        for i, entry in enumerate(self.source):
            if self.log and i % self.log == 0:
                stats = Series(list(self.__cache.values()))
                self.logger.info('[%s] Lifetime stats on iter %d: %s',
                                 str(self.__class__.__name__), i, repr(stats.describe()))

            feature = self.feature_extractor(entry)
            if feature in self.__cache:
                # Duplicate: age the cache and optionally refresh the hit; skip the entry.
                self.__prop_dict(self.__cache)
                if self.refresh:
                    self.__cache.move_to_end(feature)
            else:
                self.__cache[feature] = 0
                if self.cache_size is not None:
                    # Evict oldest features until the cache fits its bound.
                    while len(self.__cache) > self.cache_size:
                        self.__cache.popitem(last=False)
                yield entry
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class JsonFieldSource(Source):
    """Projects a single field out of each JSON object of a wrapped source."""

    def __init__(self, json_source, key, default=None):
        self.json = json_source
        self.key = key
        self.default = default  # yielded when an object lacks the key

    def __iter__(self):
        # dict.get with a fallback is equivalent to the explicit membership test.
        for json_entry in self.json:
            yield json_entry.get(self.key, self.default)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class FilteringSource(Source):
    """Yields only the entries of the wrapped source that satisfy a predicate."""

    def __init__(self, source, condition):
        self.source = source
        self.condition = condition  # callable(entry) -> bool

    def __iter__(self):
        yield from filter(self.condition, self.source)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class MappingSource(Source):
    """Applies a function to every entry of the wrapped source."""

    def __init__(self, source, function):
        self.source = source
        self.function = function  # callable(entry) -> mapped entry

    def __iter__(self):
        yield from map(self.function, self.source)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class SplittingSource(Source):
    """Flattens the wrapped source: each entry is split into several items."""

    def __init__(self, source, splitting_function):
        self.source = source
        self.function = splitting_function  # callable(entry) -> iterable of items

    def __iter__(self):
        for element in self.source:
            yield from self.function(element)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class FileLineSource(Source):
    """Yields the non-empty, whitespace-stripped lines of a text file."""

    def __init__(self, text_file_path, encoding='utf8'):
        self.source_file = text_file_path
        self.encoding = encoding

    def __iter__(self):
        with io.open(self.source_file, mode='rt', encoding=self.encoding) as in_file:
            for raw_line in in_file:
                stripped = raw_line.strip()
                if stripped:
                    yield stripped
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class OpensubtitlesSentenceSource(Source):
    """Splits each line of a wrapped source into stripped sentences on a sentence tag."""

    DEFAULT_SENTENCE_TAG = '<s>'

    def __init__(self, line_source, sentence_tag=None):
        self.source = line_source
        # Fall back to the class-level default when no tag is supplied.
        self.sentence_tag = sentence_tag if sentence_tag else OpensubtitlesSentenceSource.DEFAULT_SENTENCE_TAG

    def __iter__(self):
        for line in self.source:
            yield from (part.strip() for part in line.split(self.sentence_tag))
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class BZipDocumentSource(Source):
    """Yields whitespace-split token lists, one per line of a bzip2-compressed text file."""

    def __init__(self, bzip_filepath, text_preprocessor=None):
        self.source_filepath = bzip_filepath
        # Object with a .preprocess(str) -> str method, or None to skip preprocessing.
        self.text_preprocessor = text_preprocessor
        super().__init__()

    def __iter__(self):
        # Fix: BZ2File(path, 'rtU') raises ValueError in Python 3 ('t'/'U' are not
        # valid BZ2File modes). bz2.open in 'rt' mode opens the stream in text mode,
        # yielding decoded str lines, which is what preprocess()/split() expect.
        with bz2.open(self.source_filepath, mode='rt') as in_bz:
            for line in in_bz:
                text = line
                if self.text_preprocessor:
                    text = self.text_preprocessor.preprocess(text)
                tokens = text.split()
                yield tokens
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class GZipDocumentSource(Source):
    """Yields whitespace-split token lists, one per line of a gzip-compressed text file."""

    def __init__(self, gzip_filepath, text_preprocessor=None):
        self.source_filepath = gzip_filepath
        # Object with a .preprocess(str) -> str method, or None to skip preprocessing.
        self.text_preprocessor = text_preprocessor
        super().__init__()

    def __iter__(self):
        # Fix: GzipFile(path, 'rU') raises ValueError ('t'/'U' are rejected by
        # GzipFile). gzip.open in 'rt' mode opens the stream in text mode,
        # yielding decoded str lines, which is what preprocess()/split() expect.
        with gzip.open(self.source_filepath, mode='rt') as in_gz:
            for line in in_gz:
                text = line
                if self.text_preprocessor:
                    text = self.text_preprocessor.preprocess(text)
                tokens = text.split()
                yield tokens
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from pandas import DataFrame
|
|
7
|
+
from pandas import read_csv, read_json
|
|
8
|
+
|
|
9
|
+
from pynlple.data.jsonsource import FileJsonDataSource
|
|
10
|
+
from pynlple.data.source import Source
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__file__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DataframeSource(Source):
    """Trivial in-memory holder exposing the get/set dataframe interface."""

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def get_dataframe(self):
        """Return the wrapped dataframe."""
        return self.dataframe

    def set_dataframe(self, dataframe):
        """Replace the wrapped dataframe."""
        self.dataframe = dataframe
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TsvDataframeSource(Source):
    """Reads and writes a pandas dataframe as a separated-values (TSV by default) file."""

    def __init__(self, dataframe_path, separator='\t', quote=0, escape_char='\\', column_names=None,
                 index_column_names=None, fill_na_map=None, encoding='utf-8', index_columns=None):
        self.path = dataframe_path
        self.separator = separator
        self.column_names = column_names              # explicit column names; None -> infer header from file
        self.na_map = fill_na_map                     # {column: fill value} applied after reading
        self.encoding = encoding
        self.index_columns = index_columns            # columns promoted to the index on read
        self.index_column_names = index_column_names  # index labels written out, if any
        self.quote = quote                            # csv quoting constant passed through to pandas
        self.escape_char = escape_char

    def get_dataframe(self):
        """Read the file into a dataframe, applying index and NA-fill settings."""
        # TODO: Eats \r\n and spits sole \n in literal value strings instead
        if self.column_names:
            header = None
            names = self.column_names
        else:
            header = 'infer'
            names = None

        dataframe = read_csv(self.path,
                             sep=self.separator,
                             header=header,
                             names=names,
                             quoting=self.quote,
                             escapechar=self.escape_char,
                             encoding=self.encoding)
        if self.index_columns:
            dataframe.set_index(keys=self.index_columns, inplace=True)
        if self.na_map:
            for key, value in self.na_map.items():
                # Fix: assign back instead of `dataframe[key].fillna(value, inplace=True)` —
                # the inplace form acts on an intermediate column object and is
                # unreliable/ineffective under pandas copy-on-write.
                dataframe[key] = dataframe[key].fillna(value)
        logger.debug('Read: {} rows from {}'.format(str(len(dataframe.index)), self.path))
        return dataframe

    def set_dataframe(self, dataframe):
        """Write the dataframe to the file, optionally including named index columns."""
        if self.column_names:
            names = self.column_names
        else:
            names = True

        if self.index_column_names:
            include_index = True
            index_names = self.index_column_names
        else:
            include_index = False
            index_names = None

        dataframe.to_csv(self.path,
                         sep=self.separator,
                         header=names,
                         index=include_index,
                         index_label=index_names,
                         quoting=self.quote,
                         escapechar=self.escape_char,
                         encoding=self.encoding)
        logger.debug('Written: {} rows from {}'.format(str(len(dataframe.index)), self.path))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class JsonFileDataframeSource(Source):
    """Reads and writes a dataframe as a JSON file of records."""

    FILE_READ_METHOD = 'rt'
    FILE_WRITE_METHOD = 'wt'
    DEFAULT_ORIENT = 'records'
    DEFAULT_ENCODING = 'utf-8'

    def __init__(self, json_file_path, fill_na_map=None, index_columns=None,
                 encoding=DEFAULT_ENCODING, orient=DEFAULT_ORIENT):
        self.json_file_path = json_file_path
        self.na_map = fill_na_map        # currently unused on both paths (see TODOs)
        self.index_columns = index_columns
        self.encoding = encoding
        self.orient = orient             # pandas JSON orientation, 'records' by default

    def get_dataframe(self):
        """Parse the JSON file into a dataframe via pandas read_json."""
        cls = JsonFileDataframeSource
        with io.open(self.json_file_path, cls.FILE_READ_METHOD,
                     encoding=self.encoding) as data_file:
            # TODO: implement fill_na_map
            return read_json(data_file, orient=self.orient, encoding=cls.DEFAULT_ENCODING)

    def set_dataframe(self, dataframe):
        """Dump the dataframe (index reset into columns) as JSON records."""
        cls = JsonFileDataframeSource
        with io.open(self.json_file_path, cls.FILE_WRITE_METHOD,
                     encoding=self.encoding) as data_file:
            # TODO: implement fill_na_map
            records = dataframe.reset_index().to_dict(orient=self.orient)
            json.dump(records, data_file, ensure_ascii=False, indent=1)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class JsonNullableFileDataframeSource(Source):
    """Dataframe source over a JSON file, delegating to a JsonDataframeSource."""

    DEFAULT_ENCODING = 'utf-8'

    def __init__(self, json_file_path, keys=None, fill_na_map=None, index_columns=None,
                 encoding=DEFAULT_ENCODING):
        # Wrap the file in a JSON data source and delegate all dataframe work.
        file_source = FileJsonDataSource(file_path=json_file_path, encoding_str=encoding)
        self.__source = JsonDataframeSource(file_source,
                                            keys=keys,
                                            fill_na_map=fill_na_map,
                                            index_columns=index_columns)

    def get_dataframe(self):
        return self.__source.get_dataframe()

    def set_dataframe(self, dataframe):
        self.__source.set_dataframe(dataframe)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class JsonDataframeSource(Source):
    """Builds a dataframe from a JSON data source and writes one back as record dicts."""

    def __init__(self, json_source, keys=None, fill_na_map=None, index_columns=None):
        self.json_source = json_source
        self.keys = keys                # restrict records to these keys; None -> take all keys
        self.na_map = fill_na_map       # {key: fill value} for missing keys / NA cells
        self.index_columns = index_columns

    def get_dataframe(self):
        """Collect records from the JSON source into a dataframe, filling missing keys."""
        extracted_entries = list()
        for json_object in self.json_source.get_data():
            entry = dict()
            if self.keys:
                for key in self.keys:
                    if key not in json_object:
                        entry[key] = self.na_map[key]
                    else:
                        entry[key] = json_object[key]
            else:
                for key in json_object:
                    entry[key] = json_object[key]
                if self.na_map:
                    # Fix: iterate key/value pairs with .items(); iterating the dict
                    # directly yields bare keys and the 2-tuple unpacking raises
                    # ValueError for any key longer than two characters.
                    for key, value in self.na_map.items():
                        if key not in entry:
                            entry[key] = value
            extracted_entries.append(entry)
        dataframe = DataFrame(extracted_entries)
        if self.index_columns:
            dataframe.set_index(keys=self.index_columns, inplace=True)
        if self.na_map:
            for key, value in self.na_map.items():
                # Fix: assign back instead of inplace fillna on a .loc slice — the
                # inplace form acts on an intermediate and is unreliable under
                # pandas copy-on-write.
                dataframe[key] = dataframe[key].fillna(value)
        logger.debug('Read: {} rows from {}'.format(str(len(dataframe.index)), repr(self.json_source)))
        return dataframe

    def set_dataframe(self, dataframe):
        """Write the dataframe back to the JSON source as a list of record dicts."""
        entries = dataframe.reset_index().to_dict(orient='records')
        for entry in entries:
            if self.keys:
                for key in list(entry.keys()):
                    if key not in self.keys:
                        entry.pop(key, None)
            if self.na_map:
                # Fix: .items() — the original unpacked over the dict's keys only.
                for key, value in self.na_map.items():
                    if key not in entry:
                        entry[key] = value
        self.json_source.set_data(entries)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from pynlple.data.source import Source
|
|
3
|
+
from pynlple.exceptions import DataSourceException
|
|
4
|
+
from pynlple.module import is_folder, is_file, list_dir, append_paths
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FilePathSource(Source):
    """Class for providing filepaths from data folders."""

    def __init__(self, paths, extension_suffix=None):
        self.paths = paths
        self.extension = extension_suffix  # when set, keep only files ending with this suffix

    def get_files(self):
        """Expand the configured paths (files and folders) into a flat list of file paths."""
        accumulated_paths = list()
        for path in self.paths:
            if is_file(path):
                accumulated_paths.append(path)
            elif is_folder(path):
                entries = list_dir(path)
                if self.extension:
                    entries = [entry for entry in entries if entry.endswith(self.extension)]
                for entry in entries:
                    accumulated_paths.append(append_paths(path, entry))
            else:
                raise DataSourceException('Path {0} does not exist/is neither file nor folder!'.format(path))
        return accumulated_paths

    def __iter__(self):
        yield from self.get_files()
|