pyNlple 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. pynlple-0.0.0/MANIFEST.in +2 -0
  2. pynlple-0.0.0/PKG-INFO +47 -0
  3. pynlple-0.0.0/README.md +33 -0
  4. pynlple-0.0.0/pyNlple.egg-info/PKG-INFO +47 -0
  5. pynlple-0.0.0/pyNlple.egg-info/SOURCES.txt +48 -0
  6. pynlple-0.0.0/pyNlple.egg-info/dependency_links.txt +1 -0
  7. pynlple-0.0.0/pyNlple.egg-info/requires.txt +5 -0
  8. pynlple-0.0.0/pyNlple.egg-info/top_level.txt +1 -0
  9. pynlple-0.0.0/pynlple/__init__.py +1 -0
  10. pynlple-0.0.0/pynlple/data/__init__.py +1 -0
  11. pynlple-0.0.0/pynlple/data/corpus.py +184 -0
  12. pynlple-0.0.0/pynlple/data/datasource.py +179 -0
  13. pynlple-0.0.0/pynlple/data/filesource.py +33 -0
  14. pynlple-0.0.0/pynlple/data/jsonsource.py +179 -0
  15. pynlple-0.0.0/pynlple/data/source.py +226 -0
  16. pynlple-0.0.0/pynlple/exceptions.py +16 -0
  17. pynlple-0.0.0/pynlple/lime.py +47 -0
  18. pynlple-0.0.0/pynlple/ml/__init__.py +0 -0
  19. pynlple-0.0.0/pynlple/ml/classifiers.py +66 -0
  20. pynlple-0.0.0/pynlple/ml/supervised/__init__.py +0 -0
  21. pynlple-0.0.0/pynlple/ml/supervised/feature_exploration.py +85 -0
  22. pynlple-0.0.0/pynlple/ml/supervised/training.py +73 -0
  23. pynlple-0.0.0/pynlple/ml/transformers.py +680 -0
  24. pynlple-0.0.0/pynlple/ml/vectorizers.py +408 -0
  25. pynlple-0.0.0/pynlple/module.py +86 -0
  26. pynlple-0.0.0/pynlple/processing/__init__.py +1 -0
  27. pynlple-0.0.0/pynlple/processing/data/currency.txt +38 -0
  28. pynlple-0.0.0/pynlple/processing/data/emoji/emoji_components.txt +1295 -0
  29. pynlple-0.0.0/pynlple/processing/data/emoji/emojis.txt +2790 -0
  30. pynlple-0.0.0/pynlple/processing/data/emoji/emojis_clustered.txt +2790 -0
  31. pynlple-0.0.0/pynlple/processing/data/punctuation.txt +23 -0
  32. pynlple-0.0.0/pynlple/processing/data/rus/pos/conjunctions.txt +6 -0
  33. pynlple-0.0.0/pynlple/processing/data/rus/pos/particles.txt +8 -0
  34. pynlple-0.0.0/pynlple/processing/data/rus/pos/prepositions.txt +33 -0
  35. pynlple-0.0.0/pynlple/processing/data/rus/pos/pronouns.txt +70 -0
  36. pynlple-0.0.0/pynlple/processing/data/special_symbols.txt +29 -0
  37. pynlple-0.0.0/pynlple/processing/data/special_tags.txt +7 -0
  38. pynlple-0.0.0/pynlple/processing/data/ukr/stopwords.txt +166 -0
  39. pynlple-0.0.0/pynlple/processing/dictionary.py +318 -0
  40. pynlple-0.0.0/pynlple/processing/emojis.py +80 -0
  41. pynlple-0.0.0/pynlple/processing/mention.py +104 -0
  42. pynlple-0.0.0/pynlple/processing/preprocessor.py +933 -0
  43. pynlple-0.0.0/pynlple/processing/stopwords.py +139 -0
  44. pynlple-0.0.0/pynlple/processing/text.py +140 -0
  45. pynlple-0.0.0/pynlple/processing/token.py +176 -0
  46. pynlple-0.0.0/pynlple/utils.py +168 -0
  47. pynlple-0.0.0/pyproject.toml +25 -0
  48. pynlple-0.0.0/requirements.txt +6 -0
  49. pynlple-0.0.0/setup.cfg +4 -0
  50. pynlple-0.0.0/tests/test_utils.py +367 -0
@@ -0,0 +1,2 @@
1
+ include requirements.txt
2
+ recursive-include pynlple *.txt
pynlple-0.0.0/PKG-INFO ADDED
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyNlple
3
+ Version: 0.0.0
4
+ Summary: NLP procedures in python brought to you by YouScan.
5
+ Author-email: "YouScan Data Science team, NLP guild." <ds@youscan.io>
6
+ License: MIT Licence
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: emoji>=0.5.0
10
+ Requires-Dist: numpy>=1.18
11
+ Requires-Dist: pandas>=0.19.0
12
+ Requires-Dist: requests>=2.32.3
13
+ Requires-Dist: scikit-learn>=0.22
14
+
15
+ Overview
16
+ ========
17
+ pyNlple - a library with basic NLP utils. Also includes methods and APIs for data accessing/writing.
18
+
19
+ Requirements
20
+ ============
21
+
22
+ Cat `requirements.txt`
23
+
24
+ Installation
25
+ ============
26
+
27
+ Run setup.py to install / `pip install pynlple`
28
+
29
+ Notes
30
+ ============
31
+ - name inspired by https://www.youtube.com/watch?v=1W3sslyiUfg
32
+ - yeah, we had "PYthon" and "NLP" procedures, so...
33
+ - and yeah, try to read it as #pineapple http://www.veepy.com/wp-content/uploads/2015/05/1965.jpg
34
+
35
+ ## Licenses
36
+
37
+ Install from https://pypi.org/project/pip-licenses/
38
+
39
+ ```
40
+ pip install pip-licenses
41
+ ```
42
+
43
+ Run
44
+
45
+ ```
46
+ make licenses
47
+ ```
@@ -0,0 +1,33 @@
1
+ Overview
2
+ ========
3
+ pyNlple - a library with basic NLP utils. Also includes methods and APIs for data accessing/writing.
4
+
5
+ Requirements
6
+ ============
7
+
8
+ See `requirements.txt` for the list of required packages.
9
+
10
+ Installation
11
+ ============
12
+
13
+ Run setup.py to install / `pip install pynlple`
14
+
15
+ Notes
16
+ ============
17
+ - name inspired by https://www.youtube.com/watch?v=1W3sslyiUfg
18
+ - yeah, we had "PYthon" and "NLP" procedures, so...
19
+ - and yeah, try to read it as #pineapple http://www.veepy.com/wp-content/uploads/2015/05/1965.jpg
20
+
21
+ ## Licenses
22
+
23
+ Install from https://pypi.org/project/pip-licenses/
24
+
25
+ ```
26
+ pip install pip-licenses
27
+ ```
28
+
29
+ Run
30
+
31
+ ```
32
+ make licenses
33
+ ```
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyNlple
3
+ Version: 0.0.0
4
+ Summary: NLP procedures in python brought to you by YouScan.
5
+ Author-email: "YouScan Data Science team, NLP guild." <ds@youscan.io>
6
+ License: MIT Licence
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: emoji>=0.5.0
10
+ Requires-Dist: numpy>=1.18
11
+ Requires-Dist: pandas>=0.19.0
12
+ Requires-Dist: requests>=2.32.3
13
+ Requires-Dist: scikit-learn>=0.22
14
+
15
+ Overview
16
+ ========
17
+ pyNlple - a library with basic NLP utils. Also includes methods and APIs for data accessing/writing.
18
+
19
+ Requirements
20
+ ============
21
+
22
+ Cat `requirements.txt`
23
+
24
+ Installation
25
+ ============
26
+
27
+ Run setup.py to install / `pip install pynlple`
28
+
29
+ Notes
30
+ ============
31
+ - name inspired by https://www.youtube.com/watch?v=1W3sslyiUfg
32
+ - yeah, we had "PYthon" and "NLP" procedures, so...
33
+ - and yeah, try to read it as #pineapple http://www.veepy.com/wp-content/uploads/2015/05/1965.jpg
34
+
35
+ ## Licenses
36
+
37
+ Install from https://pypi.org/project/pip-licenses/
38
+
39
+ ```
40
+ pip install pip-licenses
41
+ ```
42
+
43
+ Run
44
+
45
+ ```
46
+ make licenses
47
+ ```
@@ -0,0 +1,48 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ requirements.txt
5
+ pyNlple.egg-info/PKG-INFO
6
+ pyNlple.egg-info/SOURCES.txt
7
+ pyNlple.egg-info/dependency_links.txt
8
+ pyNlple.egg-info/requires.txt
9
+ pyNlple.egg-info/top_level.txt
10
+ pynlple/__init__.py
11
+ pynlple/exceptions.py
12
+ pynlple/lime.py
13
+ pynlple/module.py
14
+ pynlple/utils.py
15
+ pynlple/data/__init__.py
16
+ pynlple/data/corpus.py
17
+ pynlple/data/datasource.py
18
+ pynlple/data/filesource.py
19
+ pynlple/data/jsonsource.py
20
+ pynlple/data/source.py
21
+ pynlple/ml/__init__.py
22
+ pynlple/ml/classifiers.py
23
+ pynlple/ml/transformers.py
24
+ pynlple/ml/vectorizers.py
25
+ pynlple/ml/supervised/__init__.py
26
+ pynlple/ml/supervised/feature_exploration.py
27
+ pynlple/ml/supervised/training.py
28
+ pynlple/processing/__init__.py
29
+ pynlple/processing/dictionary.py
30
+ pynlple/processing/emojis.py
31
+ pynlple/processing/mention.py
32
+ pynlple/processing/preprocessor.py
33
+ pynlple/processing/stopwords.py
34
+ pynlple/processing/text.py
35
+ pynlple/processing/token.py
36
+ pynlple/processing/data/currency.txt
37
+ pynlple/processing/data/punctuation.txt
38
+ pynlple/processing/data/special_symbols.txt
39
+ pynlple/processing/data/special_tags.txt
40
+ pynlple/processing/data/emoji/emoji_components.txt
41
+ pynlple/processing/data/emoji/emojis.txt
42
+ pynlple/processing/data/emoji/emojis_clustered.txt
43
+ pynlple/processing/data/rus/pos/conjunctions.txt
44
+ pynlple/processing/data/rus/pos/particles.txt
45
+ pynlple/processing/data/rus/pos/prepositions.txt
46
+ pynlple/processing/data/rus/pos/pronouns.txt
47
+ pynlple/processing/data/ukr/stopwords.txt
48
+ tests/test_utils.py
@@ -0,0 +1,5 @@
1
+ emoji>=0.5.0
2
+ numpy>=1.18
3
+ pandas>=0.19.0
4
+ requests>=2.32.3
5
+ scikit-learn>=0.22
@@ -0,0 +1 @@
1
+ pynlple
@@ -0,0 +1 @@
1
+ __version__ = '0.10.1'
@@ -0,0 +1 @@
1
+ 
@@ -0,0 +1,184 @@
1
+ # -*- coding: utf-8 -*-
2
+ import bz2
3
+ import gzip
4
+ import io
5
+ import logging
6
+
7
+ from pandas import Series
8
+
9
+ from pynlple.data.source import Source
10
+
11
+
12
class StackingSource(Source):
    """Chains several sources, yielding every item of each source in order."""

    logger = logging.getLogger(__name__)

    def __init__(self, list_sources, log=False):
        self.sources = list_sources
        # `log` doubles as a period: log progress every `log` sources (False disables).
        self.log = log

    def __iter__(self):
        total = len(self.sources)
        for index, inner_source in enumerate(self.sources):
            if self.log and index % self.log == 0:
                self.logger.info('[%s] Corpus iterator started yielding elements from source (%d/%d): %s.',
                                 str(self.__class__.__name__), index, total, repr(inner_source))
            yield from inner_source
28
+
29
+
30
class DeduplicatingFixedCacheSizeSource(Source):
    """Filters a source, dropping entries whose extracted feature is already
    present in a bounded, optionally LRU-refreshed ordered cache.

    The cache maps feature -> "age" (iterations since last insertion); ages of
    all cached features are incremented whenever a duplicate is observed.
    """
    logger = logging.getLogger(__name__)

    def __init__(self, source, ordered_dict_cache=None, cache_size=10000, refresh=False, feature_extractor=lambda x: x,
                 log=100000):
        # source: iterable of entries to deduplicate.
        # ordered_dict_cache: pre-populated OrderedDict to reuse (a fresh one
        #   is created lazily in __iter__ when None).
        # cache_size: max number of features kept; None means unbounded.
        # refresh: when True, a duplicate hit moves its feature to the MRU end.
        # feature_extractor: maps an entry to the hashable dedup key.
        # log: emit cache-age statistics every `log` iterations (falsy disables).
        self.source = source
        self.ordered_dict = ordered_dict_cache
        self.cache_size = cache_size
        self.refresh = refresh
        self.feature_extractor = feature_extractor
        self.log = log

    @staticmethod
    def __prop_dict(dict_):
        # Increment the "age" counter of every cached feature by one.
        for k, v in dict_.items():
            dict_[k] = v + 1

    def __iter__(self):
        if self.ordered_dict is None:
            from collections import OrderedDict
            self.__cache = OrderedDict()
        else:
            self.__cache = self.ordered_dict
        for i, entry in enumerate(self.source):
            if self.log and i % self.log == 0:
                # Snapshot of cached-age distribution for monitoring.
                stats = Series(list(self.__cache.values()))
                self.logger.info('[%s] Lifetime stats on iter %d: %s',
                                 str(self.__class__.__name__), i, repr(stats.describe()))

            f_entry = self.feature_extractor(entry)
            if f_entry in self.__cache:
                # Duplicate: age every cached feature; optionally refresh LRU position.
                self.__prop_dict(self.__cache)
                if self.refresh:
                    self.__cache.move_to_end(f_entry)
            else:
                self.__cache.__setitem__(f_entry, 0)
                if self.cache_size is not None:
                    # Evict oldest-inserted features to respect the size bound.
                    while len(self.__cache) > self.cache_size:
                        self.__cache.popitem(last=False)
                # NOTE(review): indentation reconstructed from a diff rendering —
                # yielding only first-seen entries matches the deduplicating
                # contract implied by the class name; confirm against upstream.
                yield entry
70
+
71
+
72
class JsonFieldSource(Source):
    """Projects a stream of JSON objects onto the values of a single key,
    substituting `default` when the key is absent."""

    def __init__(self, json_source, key, default=None):
        self.json = json_source
        self.key = key
        self.default = default

    def __iter__(self):
        for json_entry in self.json:
            yield json_entry[self.key] if self.key in json_entry else self.default
85
+
86
+
87
class FilteringSource(Source):
    """Yields only the entries of the wrapped source that satisfy `condition`."""

    def __init__(self, source, condition):
        self.source = source
        self.condition = condition

    def __iter__(self):
        yield from filter(self.condition, self.source)
97
+
98
+
99
class MappingSource(Source):
    """Applies `function` to every entry of the wrapped source."""

    def __init__(self, source, function):
        self.source = source
        self.function = function

    def __iter__(self):
        yield from map(self.function, self.source)
108
+
109
+
110
class SplittingSource(Source):
    """Flattens the wrapped source: each entry is split into several items
    by `splitting_function`, which are yielded one by one."""

    def __init__(self, source, splitting_function):
        self.source = source
        self.function = splitting_function

    def __iter__(self):
        for entry in self.source:
            yield from self.function(entry)
120
+
121
+
122
class FileLineSource(Source):
    """Yields the non-empty, whitespace-stripped lines of a text file."""

    def __init__(self, text_file_path, encoding='utf8'):
        self.source_file = text_file_path
        self.encoding = encoding

    def __iter__(self):
        with io.open(self.source_file, mode='rt', encoding=self.encoding) as in_file:
            for raw_line in in_file:
                stripped = raw_line.strip()
                if stripped:
                    yield stripped
135
+
136
+
137
class OpensubtitlesSentenceSource(Source):
    """Splits each incoming line on a sentence tag (default ``<s>``) and
    yields the individual sentences, stripped of surrounding whitespace."""

    DEFAULT_SENTENCE_TAG = '<s>'

    def __init__(self, line_source, sentence_tag=None):
        self.source = line_source
        # Falsy tag (None/'') falls back to the class default, as before.
        self.sentence_tag = sentence_tag if sentence_tag else OpensubtitlesSentenceSource.DEFAULT_SENTENCE_TAG

    def __iter__(self):
        for line in self.source:
            yield from (piece.strip() for piece in line.split(self.sentence_tag))
151
+
152
+
153
class BZipDocumentSource(Source):
    """Yields whitespace-token lists read line-by-line from a bzip2-compressed
    text file, optionally running each line through a text preprocessor first.
    """

    def __init__(self, bzip_filepath, text_preprocessor=None):
        # bzip_filepath: path to the .bz2 file to stream.
        # text_preprocessor: object with a preprocess(str) -> str method, or None.
        self.source_filepath = bzip_filepath
        self.text_preprocessor = text_preprocessor
        super().__init__()

    def __iter__(self):
        # BUGFIX: bz2.BZ2File accepts only binary modes, so the previous
        # mode 'rtU' raised ValueError at open time. bz2.open(..., 'rt')
        # yields decoded text lines with universal newline handling.
        with bz2.open(self.source_filepath, 'rt') as in_bz:
            for line in in_bz:
                text = line
                if self.text_preprocessor:
                    text = self.text_preprocessor.preprocess(text)
                tokens = text.split()
                yield tokens
168
+
169
+
170
class GZipDocumentSource(Source):
    """Yields whitespace-token lists read line-by-line from a gzip-compressed
    text file, optionally running each line through a text preprocessor first.
    """

    def __init__(self, gzip_filepath, text_preprocessor=None):
        # gzip_filepath: path to the .gz file to stream.
        # text_preprocessor: object with a preprocess(str) -> str method, or None.
        self.source_filepath = gzip_filepath
        self.text_preprocessor = text_preprocessor
        super().__init__()

    def __iter__(self):
        # BUGFIX: gzip.GzipFile rejects 't'/'U' in its mode, so the previous
        # mode 'rU' raised ValueError at open time. gzip.open(..., 'rt')
        # yields decoded text lines with universal newline handling.
        with gzip.open(self.source_filepath, 'rt') as in_gz:
            for line in in_gz:
                text = line
                if self.text_preprocessor:
                    text = self.text_preprocessor.preprocess(text)
                tokens = text.split()
                yield tokens
@@ -0,0 +1,179 @@
1
+ # -*- coding: utf-8 -*-
2
+ import io
3
+ import json
4
+ import logging
5
+
6
+ from pandas import DataFrame
7
+ from pandas import read_csv, read_json
8
+
9
+ from pynlple.data.jsonsource import FileJsonDataSource
10
+ from pynlple.data.source import Source
11
+
12
+ logger = logging.getLogger(__file__)
13
+
14
+
15
class DataframeSource(Source):
    """Trivial in-memory dataframe source: wraps an already-built pandas
    dataframe behind the common get/set interface used by the other sources.
    """

    def __init__(self, dataframe):
        # dataframe: the pandas DataFrame instance to hold.
        self.dataframe = dataframe

    def get_dataframe(self):
        """Return the wrapped dataframe (no copy is made)."""
        return self.dataframe

    def set_dataframe(self, dataframe):
        """Replace the wrapped dataframe with the given one."""
        self.dataframe = dataframe
25
+
26
+
27
class TsvDataframeSource(Source):
    """Reads/writes a pandas dataframe from/to a delimited text file
    (tab-separated by default) via ``read_csv`` / ``to_csv``."""

    def __init__(self, dataframe_path, separator='\t', quote=0, escape_char='\\', column_names=None,
                 index_column_names=None, fill_na_map=None, encoding='utf-8', index_columns=None):
        # dataframe_path: path of the delimited file.
        # separator: field delimiter, passed to pandas as `sep`.
        # quote: pandas `quoting` constant (0 == csv.QUOTE_MINIMAL).
        # escape_char: pandas `escapechar`.
        # column_names: explicit column names; implies the file has no header row.
        # index_column_names: labels to write for the index columns on output.
        # fill_na_map: {column: default} applied to missing values after reading.
        # index_columns: columns to promote to the dataframe index after reading.
        self.path = dataframe_path
        self.separator = separator
        self.column_names = column_names
        self.na_map = fill_na_map
        self.encoding = encoding
        self.index_columns = index_columns
        self.index_column_names = index_column_names
        self.quote = quote
        self.escape_char = escape_char

    def get_dataframe(self):
        """Read the file into a DataFrame, set the index and fill NA defaults."""
        # TODO: Eats \r\n and spits sole \n in literal value strings instead
        # Explicit names mean the file carries no header row to infer.
        if self.column_names:
            header = None
            names = self.column_names
        else:
            header = 'infer'
            names = None

        dataframe = read_csv(self.path,
                             sep=self.separator,
                             header=header,
                             names=names,
                             quoting=self.quote,
                             escapechar=self.escape_char,
                             encoding=self.encoding)
        if self.index_columns:
            dataframe.set_index(keys=self.index_columns, inplace=True)
        if self.na_map:
            # Per-column default for missing values.
            for key, value in self.na_map.items():
                dataframe[key].fillna(value, inplace=True)
        logger.debug('Read: {} rows from {}'.format(str(len(dataframe.index)), self.path))
        return dataframe

    def set_dataframe(self, dataframe):
        """Write the dataframe out with the configured delimiter/quoting."""
        # header=True lets pandas emit the dataframe's own column names.
        if self.column_names:
            names = self.column_names
        else:
            names = True

        # Write the index only when explicit index labels were configured.
        if self.index_column_names:
            include_index = True
            index_names = self.index_column_names
        else:
            include_index = False
            index_names = None

        dataframe.to_csv(self.path,
                         sep=self.separator,
                         header=names,
                         index=include_index,
                         index_label=index_names,
                         quoting=self.quote,
                         escapechar=self.escape_char,
                         encoding=self.encoding)
        logger.debug('Written: {} rows from {}'.format(str(len(dataframe.index)), self.path))
87
+
88
+
89
class JsonFileDataframeSource(Source):
    """Reads a dataframe from a JSON file via pandas ``read_json`` and writes
    it back as a list of records with ``json.dump``."""

    FILE_READ_METHOD = 'rt'
    FILE_WRITE_METHOD = 'wt'
    DEFAULT_ORIENT = 'records'
    DEFAULT_ENCODING = 'utf-8'

    def __init__(self, json_file_path, fill_na_map=None, index_columns=None,
                 encoding=DEFAULT_ENCODING, orient=DEFAULT_ORIENT):
        self.json_file_path = json_file_path
        self.na_map = fill_na_map  # NOTE(review): accepted but not applied yet, see TODOs below.
        self.index_columns = index_columns  # NOTE(review): accepted but never used in read/write.
        self.encoding = encoding
        self.orient = orient

    def get_dataframe(self):
        """Parse the JSON file into a DataFrame using the configured orient."""
        with io.open(self.json_file_path, JsonFileDataframeSource.FILE_READ_METHOD,
                     encoding=self.encoding) as data_file:
            # TODO: implement fill_na_map
            df = read_json(data_file, orient=self.orient, encoding=JsonFileDataframeSource.DEFAULT_ENCODING)
            return df

    def set_dataframe(self, dataframe):
        """Serialize the dataframe (index reset into columns) as JSON records."""
        with io.open(self.json_file_path, JsonFileDataframeSource.FILE_WRITE_METHOD,
                     encoding=self.encoding) as data_file:
            # TODO: implement fill_na_map
            json.dump(dataframe.reset_index().to_dict(orient=self.orient), data_file, ensure_ascii=False, indent=1)
115
+
116
+
117
class JsonNullableFileDataframeSource(Source):
    """Dataframe source backed by a JSON file on disk; simply delegates to a
    ``JsonDataframeSource`` built over a ``FileJsonDataSource``."""

    DEFAULT_ENCODING = 'utf-8'

    def __init__(self, json_file_path, keys=None, fill_na_map=None, index_columns=None,
                 encoding=DEFAULT_ENCODING):
        file_source = FileJsonDataSource(file_path=json_file_path, encoding_str=encoding)
        self.__source = JsonDataframeSource(file_source,
                                            keys=keys,
                                            fill_na_map=fill_na_map,
                                            index_columns=index_columns)

    def get_dataframe(self):
        """Delegate reading to the wrapped JsonDataframeSource."""
        return self.__source.get_dataframe()

    def set_dataframe(self, dataframe):
        """Delegate writing to the wrapped JsonDataframeSource."""
        self.__source.set_dataframe(dataframe)
131
+
132
+
133
class JsonDataframeSource(Source):
    """Builds a pandas DataFrame from a JSON data source and writes one back.

    `keys` optionally whitelists the fields to extract; `fill_na_map` maps
    column name -> default value for missing data; `index_columns` are set
    as the dataframe index after reading.
    """

    def __init__(self, json_source, keys=None, fill_na_map=None, index_columns=None):
        self.json_source = json_source
        self.keys = keys
        self.na_map = fill_na_map
        self.index_columns = index_columns

    def get_dataframe(self):
        """Read all entries from the json source and assemble a DataFrame."""
        extracted_entries = list()
        for json_object in self.json_source.get_data():
            entry = dict()
            if self.keys:
                for key in self.keys:
                    if key in json_object:
                        entry[key] = json_object[key]
                    elif self.na_map and key in self.na_map:
                        entry[key] = self.na_map[key]
                    else:
                        # BUGFIX: the original indexed self.na_map unconditionally
                        # for missing keys and raised when no fill map (or no
                        # default for this key) was configured; record None so
                        # pandas represents the cell as NaN instead.
                        entry[key] = None
            else:
                for key in json_object:
                    entry[key] = json_object[key]
                if self.na_map:
                    # BUGFIX: iterate items(); iterating the dict itself yields
                    # bare keys and fails to unpack into (key, value).
                    for key, value in self.na_map.items():
                        if key not in entry:
                            entry[key] = value
            extracted_entries.append(entry)
        dataframe = DataFrame(extracted_entries)
        if self.index_columns:
            dataframe.set_index(keys=self.index_columns, inplace=True)
        if self.na_map:
            for key, value in self.na_map.items():
                # BUGFIX: assign the filled column back; fillna(inplace=True)
                # on a .loc slice may act on a copy and silently do nothing.
                dataframe[key] = dataframe[key].fillna(value)
        logger.debug('Read: {} rows from {}'.format(str(len(dataframe.index)), repr(self.json_source)))
        return dataframe

    def set_dataframe(self, dataframe):
        """Write the dataframe back to the json source as a list of records."""
        entries = dataframe.reset_index().to_dict(orient='records')
        for entry in entries:
            if self.keys:
                # Drop any columns not in the configured whitelist.
                for key in list(entry.keys()):
                    if key not in self.keys:
                        entry.pop(key, None)
            if self.na_map:
                # BUGFIX: iterate items(), not the bare dict (see get_dataframe).
                for key, value in self.na_map.items():
                    if key not in entry:
                        entry[key] = value
        self.json_source.set_data(entries)
@@ -0,0 +1,33 @@
1
+ # -*- coding: utf-8 -*-
2
+ from pynlple.data.source import Source
3
+ from pynlple.exceptions import DataSourceException
4
+ from pynlple.module import is_folder, is_file, list_dir, append_paths
5
+
6
+
7
class FilePathSource(Source):
    """Class for providing filepaths from data folders."""

    def __init__(self, paths, extension_suffix=None):
        # paths: mix of file paths and folder paths to expand.
        # extension_suffix: when set, only folder entries ending with it are kept.
        self.paths = paths
        self.extension = extension_suffix

    def get_files(self):
        """Expand the configured paths into a flat list of file paths."""
        collected = list()
        for path in self.paths:
            if is_file(path):
                collected.append(path)
            elif is_folder(path):
                entries = list_dir(path)
                if self.extension:
                    entries = [entry for entry in entries if entry.endswith(self.extension)]
                collected.extend(append_paths(path, entry) for entry in entries)
            else:
                raise DataSourceException('Path {0} does not exist/is neither file nor folder!'.format(path))
        return collected

    def __iter__(self):
        yield from self.get_files()