iterabledata 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. examples/converter/convert.py +12 -0
  2. examples/simplewiki/convert.py +32 -0
  3. examples/simplewiki/convert_parquet.py +29 -0
  4. examples/simplewiki/enrich.py +48 -0
  5. iterable/__init__.py +10 -0
  6. iterable/base.py +182 -0
  7. iterable/codecs/__init__.py +7 -0
  8. iterable/codecs/brotlicodec.py +28 -0
  9. iterable/codecs/bz2codec.py +21 -0
  10. iterable/codecs/gzipcodec.py +26 -0
  11. iterable/codecs/lz4codec.py +26 -0
  12. iterable/codecs/lzmacodec.py +39 -0
  13. iterable/codecs/rawcodec.py +23 -0
  14. iterable/codecs/zipcodec.py +34 -0
  15. iterable/codecs/zstdcodec.py +34 -0
  16. iterable/convert/__init__.py +1 -0
  17. iterable/convert/core.py +73 -0
  18. iterable/datatypes/__init__.py +11 -0
  19. iterable/datatypes/avro.py +45 -0
  20. iterable/datatypes/bsonf.py +47 -0
  21. iterable/datatypes/csv.py +139 -0
  22. iterable/datatypes/json.py +54 -0
  23. iterable/datatypes/jsonl.py +69 -0
  24. iterable/datatypes/orc.py +103 -0
  25. iterable/datatypes/parquet.py +126 -0
  26. iterable/datatypes/picklef.py +58 -0
  27. iterable/datatypes/xls.py +103 -0
  28. iterable/datatypes/xlsx.py +90 -0
  29. iterable/datatypes/xml.py +82 -0
  30. iterable/datatypes/zipped.py +70 -0
  31. iterable/datatypes/zipxml.py +42 -0
  32. iterable/engines/__init__.py +1 -0
  33. iterable/engines/duckdb.py +78 -0
  34. iterable/helpers/__init__.py +0 -0
  35. iterable/helpers/detect.py +149 -0
  36. iterable/helpers/schema.py +207 -0
  37. iterable/helpers/utils.py +301 -0
  38. iterable/pipeline/__init__.py +1 -0
  39. iterable/pipeline/core.py +62 -0
  40. iterabledata-1.0.7.dist-info/METADATA +521 -0
  41. iterabledata-1.0.7.dist-info/RECORD +44 -0
  42. iterabledata-1.0.7.dist-info/WHEEL +5 -0
  43. iterabledata-1.0.7.dist-info/licenses/LICENSE +21 -0
  44. iterabledata-1.0.7.dist-info/top_level.txt +2 -0
@@ -0,0 +1,12 @@
1
+ import sys
2
+ from iterable.convert.core import convert
3
+ from tqdm import tqdm
4
+
5
+
6
+
7
def run():
    """Convert the file named by the first CLI argument into the second one.

    The conversion runs with ``silent=False`` and ``use_totals=True``.
    """
    source_path = sys.argv[1]
    target_path = sys.argv[2]
    convert(fromfile=source_path, tofile=target_path, silent=False, use_totals=True)
9
+
10
+
11
+ if __name__ == "__main__":
12
+ run()
@@ -0,0 +1,32 @@
1
+ import os
2
+ from iterable.datatypes import XMLIterable, JSONLinesIterable
3
+ from iterable.codecs import BZIP2Codec, ZSTDCodec
4
+ from tqdm import tqdm
5
+
6
+ from itertools import (takewhile, repeat)
7
+
8
def rawincount(filename):
    """Return the number of newline bytes (``b'\\n'``) in ``filename``.

    The file is read in binary mode, 1 MiB at a time, so it is never held in
    memory as a whole. The handle is closed before returning, including when
    a read fails.
    """
    chunk_size = 1024 * 1024
    newlines = 0
    with open(filename, 'rb') as f:
        while True:
            # Read through the unbuffered raw layer, one chunk per call.
            chunk = f.raw.read(chunk_size)
            if not chunk:
                break
            newlines += chunk.count(b'\n')
    return newlines
12
+
13
+ RAW_FILE = 'data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2'
14
+ RAW_INDEX = 'data/raw/simplewiki-latest-pages-articles-multistream-index.txt'
15
+ RESULT_ZSTD_JSONL_FILE = 'data/raw/simplewiki.jsonl.zst'
16
+
17
def run():
    """Convert the bzip2-compressed XML dump into zstd-compressed JSON lines.

    Every ``page`` element read from ``RAW_FILE`` is written as one record to
    ``RESULT_ZSTD_JSONL_FILE``. The newline count of ``RAW_INDEX`` is used as
    the progress-bar total. Both iterables are closed even if the conversion
    fails part-way.
    """
    codecobj = BZIP2Codec(RAW_FILE, mode='r')
    iterable = XMLIterable(codec=codecobj, tagname='page')
    try:
        wrcodecobj = ZSTDCodec(RESULT_ZSTD_JSONL_FILE, mode='w')
        witerable = JSONLinesIterable(codec=wrcodecobj, mode='w')
        try:
            num = rawincount(RAW_INDEX)
            for row in tqdm(iterable, total=num, desc='Converting data'):
                witerable.write(row)
        finally:
            witerable.close()
    finally:
        iterable.close()
30
+
31
+ if __name__ == "__main__":
32
+ run()
@@ -0,0 +1,29 @@
1
+ import os
2
+ import sys
3
+ from iterable.datatypes import JSONLinesIterable, ParquetIterable
4
+ from iterable.codecs import ZSTDCodec
5
+
6
+ RAW_FILE = sys.argv[1]
7
+ RESULT_PARQUET_FILE = sys.argv[2]
8
+ BATCH_SIZE = 10000
9
+
10
def run():
    """Copy records from the zstd JSON lines input into a Parquet file.

    Records are collected into lists of ``BATCH_SIZE`` items and handed to
    ``write_bulk``; whatever is left at the end is written as a final,
    shorter batch.
    """
    source = JSONLinesIterable(codec=ZSTDCodec(RAW_FILE, mode='r'))
    target = ParquetIterable(RESULT_PARQUET_FILE, mode='w', use_pandas=False, adapt_schema=True, batch_size=BATCH_SIZE)

    pending = []
    for record in source:
        pending.append(record)
        # The list is emptied after each flush, so its length hits BATCH_SIZE
        # exactly on every BATCH_SIZE-th record.
        if len(pending) == BATCH_SIZE:
            target.write_bulk(pending)
            pending = []
    if pending:
        target.write_bulk(pending)
    source.close()
    target.close()
27
+
28
+ if __name__ == "__main__":
29
+ run()
@@ -0,0 +1,48 @@
1
+ from itertools import (takewhile, repeat)
2
+ from iterable.datatypes import JSONLinesIterable
3
+ from iterable.codecs import ZSTDCodec
4
+ import json
5
+ from tqdm import tqdm
6
+ import csv
7
+ import wikitextparser as wtp
8
+
9
+ CATEGORY_TEXT = 'Category'
10
+
11
def clean_text(s):
    """Return ``s`` with wiki markup removed and surrounding whitespace stripped.

    ``<br/>`` tags are replaced by a single space before the markup is removed.
    """
    without_breaks = s.replace('<br/>', ' ')
    return wtp.remove_markup(without_breaks).strip()
15
+
16
def rawincount(filename):
    """Return the number of newline bytes (``b'\\n'``) in ``filename``.

    The file is read in binary mode in 1 MiB chunks and is closed before the
    function returns, including when a read raises.
    """
    total = 0
    with open(filename, 'rb') as f:
        while True:
            # Unbuffered read straight from the raw layer.
            buf = f.raw.read(1024 * 1024)
            if not buf:
                break
            total += buf.count(b'\n')
    return total
20
+
21
+ RAW_INDEX = 'data/raw/simplewiki-latest-pages-articles-multistream-index.txt'
22
+ RESULT_ZSTD_JSONL_FILE = 'data/raw/simplewiki.jsonl.zst'
23
+ ENRICHED_ZSTD_JSONL_FILE = 'data/raw/simplewiki_prepared.jsonl.zst'
24
+
25
def run():
    """Add a ``categories`` list to every record that carries wikitext.

    Records are read from ``RESULT_ZSTD_JSONL_FILE``. When a record has a
    ``revision.text`` mapping with a ``#text`` entry, that wikitext is parsed
    and the part after ``Category:`` of every matching wikilink title is
    collected into ``data['categories']``. Every record, enriched or not, is
    written to ``ENRICHED_ZSTD_JSONL_FILE``. The newline count of
    ``RAW_INDEX`` is used as the progress-bar total.
    """
    total = rawincount(RAW_INDEX)
    in_codec = ZSTDCodec(RESULT_ZSTD_JSONL_FILE, mode='r')
    out_codec = ZSTDCodec(ENRICHED_ZSTD_JSONL_FILE, mode='w')
    # The input is only read, so it is opened in read mode like its codec.
    in_iterable = JSONLinesIterable(codec=in_codec, mode='r')
    out_iterable = JSONLinesIterable(codec=out_codec, mode='w')
    prefix = CATEGORY_TEXT + ':'

    try:
        for data in tqdm(in_iterable, total=total):
            text = data['revision']['text']
            if text is not None and '#text' in text:
                parsed = wtp.parse(text['#text'])
                data['categories'] = [
                    link.title.split(prefix)[1]
                    for link in parsed.wikilinks
                    if prefix in link.title
                ]
            out_iterable.write(data)
    finally:
        # Release both files even when parsing or writing fails.
        try:
            in_iterable.close()
        finally:
            out_iterable.close()
45
+
46
+
47
+ if __name__ == "__main__":
48
+ run()
iterable/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ __author__ = "Ivan Begtin"
2
+ __version__ = "1.0.7"
3
+ __licence__ = "MIT"
4
+ __doc__ = "Iterable data processing Python library"
5
+
6
+
7
+
8
+ from .helpers.detect import open_iterable
9
+
10
+ open_it = open_iterable
iterable/base.py ADDED
@@ -0,0 +1,182 @@
1
+ # -*- coding: utf-8 -*-
2
+ ITERABLE_TYPE_STREAM = 10
3
+ ITERABLE_TYPE_FILE = 20
4
+ ITERABLE_TYPE_CODEC = 30
5
+ DEFAULT_BULK_NUMBER = 100
6
+
7
+ import io
8
+ import typing
9
+
10
+
11
class BaseCodec:
    """Basic codec class.

    A codec wraps a (usually compressed) file and exposes the underlying file
    object. Subclasses are expected to implement ``open``, ``close`` and
    ``fileexts``; the versions here raise ``NotImplementedError``.
    """
    def __init__(self, filename: str = None, fileobj: typing.IO = None, mode: str = 'r', open_it: bool = False, options: dict = None):
        """Store the codec settings and optionally open the file.

        Args:
            filename: Path of the file handled by the codec.
            fileobj: Already opened file object to use as the codec's stream.
            mode: File mode handed to the subclass ``open`` implementation.
            open_it: When true, ``open()`` is called during initialisation.
            options: Extra attributes; every key/value pair is set on the
                instance with ``setattr``. ``None`` (the default) and an
                empty dict both mean "no options".
        """
        self._fileobj = fileobj
        self.filename = filename
        self.mode = mode
        if open_it:
            self.open()

        # Options are applied after the optional open().
        if options:
            for k, v in options.items():
                setattr(self, k, v)

    @staticmethod
    def fileexts():
        """Return file extensions"""
        raise NotImplementedError

    def reset(self):
        """Reset the codec by closing and reopening the file."""
        self.close()
        self.open()

    def open(self):
        """Open the file. Not implemented by default."""
        raise NotImplementedError

    def fileobj(self):
        """Return file object"""
        return self._fileobj

    def close(self):
        """Close codec. Not implemented by default"""
        raise NotImplementedError

    def textIO(self, encoding: str = 'utf8'):
        """Return text wrapper over binary stream"""
        return io.TextIOWrapper(self.fileobj(), encoding=encoding, write_through=True)
53
+
54
+
55
class BaseIterable:
    """Base iterable data class.

    Declares the reading/writing interface of an iterable data source.
    Methods without a sensible default raise ``NotImplementedError`` and are
    meant to be overridden by subclasses.
    """
    def __init__(self):
        pass

    def reset(self):
        """Reset iterator"""
        raise NotImplementedError

    @staticmethod
    def id():
        """Identifier of selected destination"""
        raise NotImplementedError

    @staticmethod
    def has_totals():
        """Has totals. Default: False"""
        return False

    def read(self, skip_empty: bool = True):
        """Read single record"""
        raise NotImplementedError

    def read_bulk(self, num: int = DEFAULT_BULK_NUMBER):
        """Read multiple records"""
        raise NotImplementedError

    @staticmethod
    def is_flatonly():
        """Is source flat by only. Default: False"""
        return False

    def is_flat(self):
        """Return True when the source is flat-only.

        Raises:
            NotImplementedError: when the class is not flat-only and the
                subclass did not override this method.
        """
        # Call the static method through the instance; building a throwaway
        # instance would fail for subclasses whose constructor needs arguments.
        if self.is_flatonly():
            return True
        raise NotImplementedError

    def is_streaming(self):
        """Is source streaming. Default: False"""
        return False

    def __next__(self):
        """Return the next record by delegating to ``read()``."""
        return self.read()

    def __iter__(self):
        """Return the object itself; iteration does not reset the source."""
        return self

    def write(self, record: dict):
        """Write single record"""
        raise NotImplementedError

    def write_bulk(self, records: list[dict]):
        """Write multiple records"""
        raise NotImplementedError
113
+
114
+
115
class BaseFileIterable(BaseIterable):
    """Basic file iterable.

    The data comes from exactly one of three sources, checked in this order:
    an already opened ``stream``, a ``filename`` opened by this class, or a
    ``codec`` object that opens the file itself.
    """
    # With 'text', a codec-backed source is wrapped by the codec's textIO().
    datamode = 'text'

    def __init__(self, filename:str = None, stream:typing.IO = None, codec: BaseCodec = None, binary:bool = False, encoding:str = 'utf8', noopen:bool = False, mode:str = 'r', options:dict = None):
        """Init basic file iterable.

        Args:
            filename: Path of the file to open.
            stream: Already opened file object; used as is.
            codec: Codec object that provides the file object.
            binary: Open ``filename`` in binary mode.
            encoding: Text encoding for text-mode files and codec text wrappers.
            noopen: When true, neither the file nor the codec is opened here.
            mode: File mode, e.g. ``'r'`` or ``'w'``.
            options: Extra attributes set on the instance with ``setattr``.

        Raises:
            ValueError: if none of ``stream``, ``filename`` and ``codec`` is given.
        """
        self.filename = filename
        self.noopen = noopen
        self.encoding = encoding
        self.binary = binary
        self.mode = mode
        self.codec = codec
        if stream is not None:
            self.stype = ITERABLE_TYPE_STREAM
        elif filename is not None:
            self.stype = ITERABLE_TYPE_FILE
        elif codec is not None:
            self.stype = ITERABLE_TYPE_CODEC
        else:
            # Without a source the source type would stay undefined and the
            # checks below would fail with an obscure AttributeError.
            raise ValueError('One of filename, stream or codec is required')
        self.fobj = None

        if self.stype == ITERABLE_TYPE_FILE:
            if not noopen:
                self.open()
        elif self.stype == ITERABLE_TYPE_STREAM:
            self.fobj = stream
        elif self.stype == ITERABLE_TYPE_CODEC:
            if not noopen:
                self.fobj = self.codec.open()
                if self.datamode == 'text':
                    self.fobj = self.codec.textIO(encoding=self.encoding)
        if options:
            for k, v in options.items():
                setattr(self, k, v)

    def open(self):
        """Open file as file data source.

        Raises:
            NotImplementedError: for stream- and codec-backed sources.
        """
        if self.stype == ITERABLE_TYPE_FILE:
            if self.binary:
                # Do not append a second 'b' when the mode is already binary.
                binary_mode = self.mode if 'b' in self.mode else self.mode + 'b'
                self.fobj = open(self.filename, binary_mode)
            else:
                self.fobj = open(self.filename, self.mode, encoding=self.encoding)
            return self.fobj
        else:
            raise NotImplementedError

    def reset(self):
        """Rewind the source.

        Plain files are rewound with ``seek(0)``. Codec-backed sources opened
        for reading are reset through the codec and re-wrapped as text when
        ``datamode`` is ``'text'``. Streams are left untouched.
        """
        if self.stype == ITERABLE_TYPE_FILE:
            if self.fobj is not None:
                self.fobj.seek(0)
        elif self.stype == ITERABLE_TYPE_CODEC:
            if self.fobj is not None and self.mode not in ['w', 'wb']:
                self.codec.reset()
                self.fobj = self.codec.fileobj()
                if self.datamode == 'text':
                    self.fobj = self.codec.textIO(encoding=self.encoding)

    def close(self):
        """Close file as file data source.

        Streams passed in by the caller are not closed here.
        """
        if self.stype == ITERABLE_TYPE_FILE:
            if self.fobj is not None:
                self.fobj.close()
        elif self.stype == ITERABLE_TYPE_CODEC:
            if self.codec is not None:
                self.codec.close()
@@ -0,0 +1,7 @@
1
+ from .bz2codec import BZIP2Codec
2
+ from .gzipcodec import GZIPCodec
3
+ from .lzmacodec import LZMACodec
4
+ from .lz4codec import LZ4Codec
5
+ from .zipcodec import ZIPCodec
6
+ from .zstdcodec import ZSTDCodec
7
+ from .brotlicodec import BrotliCodec
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import brotli_file
6
+
7
+ BROTLI_DEFAULT_COMPRESSION_LEVEL = 11
8
+
9
class BrotliCodec(BaseCodec):
    """Codec for Brotli-compressed files, backed by ``brotli_file``."""

    def __init__(self, filename:str, compression_level:int = BROTLI_DEFAULT_COMPRESSION_LEVEL, mode:str = 'r', open_it:bool = False, options:dict = {}):
        """Store the Brotli quality, then let ``BaseCodec`` finish the setup."""
        # Set before the base initialiser runs: it may call open() right away.
        self.compression_level = compression_level
        super().__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> brotli_file.BrotliFile:
        """Open the file with ``brotli_file.open`` and return the file object."""
        handle = brotli_file.open(self.filename, mode=self.mode, quality=self.compression_level)
        self._fileobj = handle
        return handle

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'brotli'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['br', 'brotli']
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import bz2
6
+
7
class BZIP2Codec(BaseCodec):
    """Codec for bzip2-compressed files, backed by the standard ``bz2`` module."""

    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = {}):
        """Store the compression level, then let ``BaseCodec`` finish the setup.

        Args:
            filename: Path of the bzip2 file.
            compression_level: Passed to ``bz2.open`` as ``compresslevel``.
            mode: File mode passed to ``bz2.open``.
            open_it: When true, the file is opened during initialisation.
            options: Extra attributes set on the instance by ``BaseCodec``.
        """
        # Set before the base initialiser runs: it may call open() right away.
        self.compression_level = compression_level
        super(BZIP2Codec, self).__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> bz2.BZ2File:
        """Open the file with ``bz2.open`` and return the file object."""
        self._fileobj = bz2.open(self.filename, self.mode, compresslevel=self.compression_level)
        return self._fileobj

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier, as the other codec classes do."""
        return 'bz2'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['bz2',]
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import gzip
6
+
7
class GZIPCodec(BaseCodec):
    """Codec for gzip-compressed files, backed by the standard ``gzip`` module."""

    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = {}):
        """Store the compression level, then let ``BaseCodec`` finish the setup."""
        # Set before the base initialiser runs: it may call open() right away.
        self.compression_level = compression_level
        super().__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> gzip.GzipFile:
        """Create the ``gzip.GzipFile`` for the file and return it."""
        handle = gzip.GzipFile(filename=self.filename, mode=self.mode, compresslevel=self.compression_level)
        self._fileobj = handle
        return handle

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'gzip'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['gz',]
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import lz4.frame
6
+
7
class LZ4Codec(BaseCodec):
    """Codec for LZ4 frame files, backed by ``lz4.frame``."""

    def __init__(self, filename:str, compression_level:int = lz4.frame.COMPRESSIONLEVEL_MINHC, mode:str = 'r', open_it:bool = False, options:dict = {}):
        """Store the compression level, then let ``BaseCodec`` finish the setup."""
        # Set before the base initialiser runs: it may call open() right away.
        self.compression_level = compression_level
        super().__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> lz4.frame.LZ4FrameFile:
        """Open the file with ``lz4.frame.open`` and return the file object."""
        handle = lz4.frame.open(self.filename, mode=self.mode, compression_level=self.compression_level)
        self._fileobj = handle
        return handle

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'lz4'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['lz4',]
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import lzma
6
+
7
+ LZMA_FILTERS = [
8
+ {"id": lzma.FILTER_DELTA, "dist": 5},
9
+ {"id": lzma.FILTER_LZMA2, "preset": 7 | lzma.PRESET_EXTREME},
10
+ ]
11
+
12
class LZMACodec(BaseCodec):
    """Codec for XZ files, backed by the standard ``lzma`` module."""

    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = {}):
        """Store the compression level, then let ``BaseCodec`` finish the setup.

        Args:
            filename: Path of the XZ file.
            compression_level: Stored on the instance; ``open()`` does not
                pass it to ``lzma`` because custom filters are not used.
            mode: File mode passed to ``lzma.LZMAFile``.
            open_it: When true, the file is opened during initialisation.
            options: Extra attributes set on the instance by ``BaseCodec``.
        """
        # Set before the base initialiser runs: it may call open() right away.
        self.compression_level = compression_level
        super(LZMACodec, self).__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> lzma.LZMAFile:
        """Open the file in XZ format and return the file object.

        The module-level filter chain is deliberately left untouched: it is
        not handed to ``LZMAFile``, so modifying it here would only leak one
        codec's settings into shared state.
        """
        self._fileobj = lzma.LZMAFile(self.filename, mode=self.mode, format=lzma.FORMAT_XZ)
        return self._fileobj

    def reset(self):
        """Reopen the file when reading; do nothing in write mode."""
        if self.mode not in ['w', 'wb']:
            super(LZMACodec, self).reset()

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'xz'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['xz', 'lzma']
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+
6
class RAWCodec(BaseCodec):
    """Codec for plain files: the file is opened with the built-in ``open``."""

    def __init__(self, filename:str, mode:str = 'r', open_it:bool = False, options:dict = {}):
        """Pass all settings straight to ``BaseCodec``."""
        super().__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> typing.IO:
        """Open the file in the configured mode and return the file object."""
        handle = open(self.filename, self.mode)
        self._fileobj = handle
        return handle

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'raw'

    @staticmethod
    def fileexts() -> list[str]:
        """Return ``None``: no file extensions are listed for this codec."""
        return None
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import zipfile
6
+
7
class ZIPCodec(BaseCodec):
    """Codec that exposes the first member of a ZIP archive as a file object."""

    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict={}):
        """Store the settings, then let ``BaseCodec`` finish the setup.

        Args:
            filename: Path of the ZIP archive.
            compression_level: Stored on the instance; not used by ``open()``.
            mode: Archive mode. ``'rb'`` is accepted and treated as ``'r'``.
            open_it: When true, the archive is opened during initialisation.
            options: Extra attributes set on the instance by ``BaseCodec``.
        """
        self.compression_level = compression_level
        if mode == 'rb':
            mode = 'r'
        # ZipFile.open() accepts only 'r' or 'w' and always yields a binary
        # stream, so the member is opened with 'r' even for a 'rb' request.
        self.filemode = 'r'
        super(ZIPCodec, self).__init__(filename, mode=mode, open_it=open_it, options=options)

    def open(self) -> zipfile.ZipFile:
        """Open the archive and its first member; return the member's file object."""
        self._archiveobj = zipfile.ZipFile(self.filename, mode=self.mode)
        fnames = self._archiveobj.namelist()
        self._fileobj = self._archiveobj.open(fnames[0], self.filemode)
        return self._fileobj

    def close(self):
        """Close the member file object and then the archive."""
        self._fileobj.close()
        self._archiveobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'zip'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['zip',]
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+ import typing
3
+ from ..base import BaseCodec
4
+
5
+ import zstandard as zstd
6
+
7
+
8
+ # Zstandard file object doesn't support file seek so reset rewritten to close and open file
9
+
10
class ZSTDCodec(BaseCodec):
    """Codec for Zstandard-compressed files, backed by ``zstandard``."""

    def __init__(self, filename:str, compression_level:int = 0, mode:str = 'rb', open_it:bool = False, options:dict={}):
        """Store the compression level and normalise the mode to binary.

        ``'r'`` and ``'rb'`` become ``'rb'``; every other mode becomes ``'wb'``.
        """
        self.compression_level = compression_level
        if mode in ['r', 'rb']:
            binary_mode = 'rb'
        else:
            binary_mode = 'wb'
        super().__init__(filename, mode=binary_mode, open_it=open_it, options=options)

    def open(self) -> zstd.ZstdDecompressionReader:
        """Open the file with ``zstd.open`` and return the file object."""
        handle = zstd.open(self.filename, mode=self.mode)
        self._fileobj = handle
        return handle

    def reset(self):
        """Reset by closing and reopening the file instead of seeking."""
        self.close()
        self._fileobj = self.open()

    def close(self):
        """Close the underlying file object."""
        self._fileobj.close()

    @staticmethod
    def id():
        """Return the codec identifier."""
        return 'zst'

    @staticmethod
    def fileexts() -> list[str]:
        """Return the file extensions handled by this codec."""
        return ['zstd', 'zst']
@@ -0,0 +1 @@
1
+ from .core import convert
@@ -0,0 +1,73 @@
1
+ # -*- coding: utf-8 -*-
2
+ from ..helpers.detect import open_iterable, is_flat
3
+ from ..helpers.utils import dict_generator, make_flat
4
+ from tqdm import tqdm
5
+ import logging
6
+ import time
7
+
8
+
9
+ DEFAULT_BATCH_SIZE = 50000
10
+ DEFAULT_HEADERS_DETECT_LIMIT = 1000
11
+
12
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
13
+
14
+ DEFAULT_BATCH_SIZE = 50000
15
+
16
+
17
def convert(fromfile:str, tofile:str, iterableargs:dict=None, scan_limit:int=DEFAULT_HEADERS_DETECT_LIMIT, batch_size:int=DEFAULT_BATCH_SIZE, silent:bool=True, is_flatten:bool=False, use_totals:bool=False):
    """Convert the records of one data file into another file.

    When ``is_flat(tofile)`` is true, the source is scanned first to collect
    the field names, which are passed to the output iterable as ``keys``.
    The source is then reset and its records are written to the target in
    batches.

    Args:
        fromfile: Source file, opened with ``open_iterable`` in read mode.
        tofile: Target file, opened with ``open_iterable`` in write mode.
        iterableargs: Options forwarded to ``open_iterable`` for the source.
            ``None`` (the default) means no options.
        scan_limit: Maximum number of records inspected while collecting
            field names; ``None`` scans the whole source.
        batch_size: Number of records handed to ``write_bulk`` at once.
        silent: When false, progress bars are shown.
        is_flatten: When true, records are flattened with ``make_flat`` and
            every collected key missing from a record is added as ``None``.
        use_totals: When true and the source reports ``has_totals()``, its
            ``totals()`` value is used as the progress-bar total.
    """
    if iterableargs is None:
        iterableargs = {}
    it_in = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
    it_out = None
    try:
        # A dict serves as an insertion-ordered set: constant-time membership
        # while keeping the order in which keys were first seen.
        keys = {}
        is_flat_output = is_flat(tofile)
        if is_flat_output:
            if not silent:
                logging.debug('Extracting schema')
            # The progress bar is created only when the scan really runs.
            scan = tqdm(it_in, total=scan_limit, desc='Schema analysis') if not silent else it_in
            for n, item in enumerate(scan):
                # Inspect at most scan_limit records.
                if scan_limit is not None and n >= scan_limit:
                    break
                if not is_flatten:
                    # Each entry yielded by dict_generator is joined, without
                    # its last element, into a dotted key.
                    for parts in dict_generator(item):
                        keys.setdefault(".".join(parts[:-1]))
                else:
                    for k in make_flat(item).keys():
                        keys.setdefault(k)

        it_in.reset()
        args = {'keys': list(keys)} if is_flat_output else {}
        it_out = open_iterable(tofile, mode='w', iterableargs=args)

        logging.debug('Converting data')
        if use_totals and it_in.has_totals():
            totals = it_in.totals()
            logging.debug(f'Total rows: {totals}')
            it_in.reset()
            it = tqdm(it_in, total=totals, desc='Converting') if not silent else it_in
        else:
            it = tqdm(it_in, desc='Converting') if not silent else it_in

        batch = []
        for row in it:
            if is_flatten:
                # Flatten first, then fill the gaps: the collected keys are
                # flattened names, so filling the nested row beforehand could
                # clash with the values produced by flattening.
                flat_row = make_flat(row)
                for k in keys:
                    if k not in flat_row:
                        flat_row[k] = None
                batch.append(flat_row)
            else:
                batch.append(row)
            if len(batch) >= batch_size:
                it_out.write_bulk(batch)
                batch = []
        if batch:
            it_out.write_bulk(batch)
    finally:
        # Close both ends even when the conversion fails part-way.
        try:
            it_in.close()
        finally:
            if it_out is not None:
                it_out.close()
@@ -0,0 +1,11 @@
1
+ from .avro import AVROIterable
2
+ from .bsonf import BSONIterable
3
+ from .csv import CSVIterable
4
+ from .orc import ORCIterable
5
+ from .parquet import ParquetIterable
6
+ from .picklef import PickleIterable
7
+ from .json import JSONIterable
8
+ from .jsonl import JSONLinesIterable
9
+ from .xls import XLSIterable
10
+ from .xlsx import XLSXIterable
11
+ from .xml import XMLIterable