PyPI - iterabledata - Versions diffs - 1.0.7__py3-none-any.whl - Mend

iterabledata 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

examples/converter/convert.py +12 -0
examples/simplewiki/convert.py +32 -0
examples/simplewiki/convert_parquet.py +29 -0
examples/simplewiki/enrich.py +48 -0
iterable/__init__.py +10 -0
iterable/base.py +182 -0
iterable/codecs/__init__.py +7 -0
iterable/codecs/brotlicodec.py +28 -0
iterable/codecs/bz2codec.py +21 -0
iterable/codecs/gzipcodec.py +26 -0
iterable/codecs/lz4codec.py +26 -0
iterable/codecs/lzmacodec.py +39 -0
iterable/codecs/rawcodec.py +23 -0
iterable/codecs/zipcodec.py +34 -0
iterable/codecs/zstdcodec.py +34 -0
iterable/convert/__init__.py +1 -0
iterable/convert/core.py +73 -0
iterable/datatypes/__init__.py +11 -0
iterable/datatypes/avro.py +45 -0
iterable/datatypes/bsonf.py +47 -0
iterable/datatypes/csv.py +139 -0
iterable/datatypes/json.py +54 -0
iterable/datatypes/jsonl.py +69 -0
iterable/datatypes/orc.py +103 -0
iterable/datatypes/parquet.py +126 -0
iterable/datatypes/picklef.py +58 -0
iterable/datatypes/xls.py +103 -0
iterable/datatypes/xlsx.py +90 -0
iterable/datatypes/xml.py +82 -0
iterable/datatypes/zipped.py +70 -0
iterable/datatypes/zipxml.py +42 -0
iterable/engines/__init__.py +1 -0
iterable/engines/duckdb.py +78 -0
iterable/helpers/__init__.py +0 -0
iterable/helpers/detect.py +149 -0
iterable/helpers/schema.py +207 -0
iterable/helpers/utils.py +301 -0
iterable/pipeline/__init__.py +1 -0
iterable/pipeline/core.py +62 -0
iterabledata-1.0.7.dist-info/METADATA +521 -0
iterabledata-1.0.7.dist-info/RECORD +44 -0
iterabledata-1.0.7.dist-info/WHEEL +5 -0
iterabledata-1.0.7.dist-info/licenses/LICENSE +21 -0
iterabledata-1.0.7.dist-info/top_level.txt +2 -0

examples/converter/convert.py ADDED Viewed

@@ -0,0 +1,12 @@
+import sys
+from iterable.convert.core import convert
+from tqdm import tqdm
+def run():
+    """Convert the file named by argv[1] into the file named by argv[2].
+
+    Progress output is enabled (``silent=False``) and record totals are
+    requested from the source (``use_totals=True``).
+
+    Raises:
+        SystemExit: with a usage message when fewer than two arguments
+            were given on the command line.
+    """
+    if len(sys.argv) < 3:
+        # Fail with a usage hint instead of an opaque IndexError.
+        sys.exit(f'Usage: {sys.argv[0]} <fromfile> <tofile>')
+    convert(fromfile=sys.argv[1], tofile=sys.argv[2], silent=False, use_totals=True)
+if __name__ == "__main__":
+    run()

examples/simplewiki/convert.py ADDED Viewed

@@ -0,0 +1,32 @@
+import os
+from iterable.datatypes import XMLIterable, JSONLinesIterable
+from iterable.codecs import BZIP2Codec, ZSTDCodec
+from tqdm import tqdm
+from itertools import (takewhile, repeat)
+def rawincount(filename):
+    """Return the number of newline bytes in *filename*.
+
+    The file is read unbuffered in 1 MiB chunks so that large files are
+    counted without being loaded into memory.
+    """
+    # The context manager closes the handle; it was left open before.
+    with open(filename, 'rb') as f:
+        bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
+        return sum(buf.count(b'\n') for buf in bufgen)
+# Input: bzip2-compressed XML dump that run() reads page by page.
+RAW_FILE = 'data/raw/simplewiki-latest-pages-articles-multistream.xml.bz2'
+# Index file; run() uses its newline count as the progress-bar total.
+RAW_INDEX = 'data/raw/simplewiki-latest-pages-articles-multistream-index.txt'
+# Output: zstd-compressed JSON lines file written by run().
+RESULT_ZSTD_JSONL_FILE = 'data/raw/simplewiki.jsonl.zst'
+def run():
+    """Convert the bzip2-compressed XML dump into zstd-compressed JSON lines.
+
+    Every ``page`` element read from ``RAW_FILE`` is written as one record
+    to ``RESULT_ZSTD_JSONL_FILE``. The newline count of ``RAW_INDEX`` is
+    used as the progress-bar total.
+    """
+    codecobj = BZIP2Codec(RAW_FILE, mode='r')
+    iterable = XMLIterable(codec=codecobj, tagname='page')
+    wrcodecobj = ZSTDCodec(RESULT_ZSTD_JSONL_FILE, mode='w')
+    witerable = JSONLinesIterable(codec=wrcodecobj, mode='w')
+    try:
+        num = rawincount(RAW_INDEX)
+        for row in tqdm(iterable, total=num, desc='Converting data'):
+            witerable.write(row)
+    finally:
+        # Close both ends even when reading or writing fails part-way.
+        iterable.close()
+        witerable.close()
+if __name__ == "__main__":
+    run()

examples/simplewiki/convert_parquet.py ADDED Viewed

@@ -0,0 +1,29 @@
+import os
+import sys
+from iterable.datatypes import JSONLinesIterable, ParquetIterable
+from iterable.codecs import ZSTDCodec
+# Input path, taken from the first command-line argument when the module loads.
+RAW_FILE = sys.argv[1]
+# Output path, taken from the second command-line argument when the module loads.
+RESULT_PARQUET_FILE = sys.argv[2]
+# Rows per write_bulk() call in run(); also passed to ParquetIterable as batch_size.
+BATCH_SIZE = 10000
+def run():
+    """Copy the zstd-compressed JSON lines in ``RAW_FILE`` to a Parquet file.
+
+    Records are buffered and handed to ``write_bulk`` in groups of
+    ``BATCH_SIZE``; a final partial group is written after the input ends.
+    """
+    codec_obj = ZSTDCodec(RAW_FILE, mode='r')
+    iterable = JSONLinesIterable(codec=codec_obj)
+    writerable = ParquetIterable(RESULT_PARQUET_FILE, mode='w', use_pandas=False, adapt_schema=True, batch_size=BATCH_SIZE)
+    try:
+        rows = []
+        for row in iterable:
+            rows.append(row)
+            if len(rows) >= BATCH_SIZE:
+                writerable.write_bulk(rows)
+                rows = []
+        if rows:
+            writerable.write_bulk(rows)
+    finally:
+        # Close both ends even when reading or writing fails part-way.
+        iterable.close()
+        writerable.close()
+if __name__ == "__main__":
+    run()

examples/simplewiki/enrich.py ADDED Viewed

@@ -0,0 +1,48 @@
+from itertools import (takewhile, repeat)
+from iterable.datatypes import JSONLinesIterable
+from iterable.codecs import ZSTDCodec
+import json
+from tqdm import tqdm
+import csv
+import wikitextparser as wtp
+# Link-title prefix (without the trailing colon) that run() searches for in wikilinks.
+CATEGORY_TEXT = 'Category'
+def clean_text(s):
+    """Return *s* with wiki markup removed and surrounding whitespace stripped.
+
+    ``<br/>`` tags are turned into single spaces before the markup is removed.
+    """
+    without_breaks = s.replace('<br/>', ' ')
+    return wtp.remove_markup(without_breaks).strip()
+def rawincount(filename):
+    """Return the number of newline bytes in *filename*.
+
+    The file is read unbuffered in 1 MiB chunks so that large files are
+    counted without being loaded into memory.
+    """
+    # The context manager closes the handle; it was left open before.
+    with open(filename, 'rb') as f:
+        bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
+        return sum(buf.count(b'\n') for buf in bufgen)
+# Index file; run() uses its newline count as the progress-bar total.
+RAW_INDEX = 'data/raw/simplewiki-latest-pages-articles-multistream-index.txt'
+# Input: zstd-compressed JSON lines file read by run().
+RESULT_ZSTD_JSONL_FILE = 'data/raw/simplewiki.jsonl.zst'
+# Output: zstd-compressed JSON lines file with the added 'categories' field.
+ENRICHED_ZSTD_JSONL_FILE = 'data/raw/simplewiki_prepared.jsonl.zst'
+def run():
+    """Add a ``categories`` list to every record that carries revision text.
+
+    Records are read from ``RESULT_ZSTD_JSONL_FILE``. For each record whose
+    ``revision.text`` holds a ``#text`` entry, the wikitext is parsed, the
+    part after ``Category:`` of every matching wikilink title is collected
+    into ``data['categories']``, and the record is written to
+    ``ENRICHED_ZSTD_JSONL_FILE``. Records without such text are not written.
+    """
+    total = rawincount(RAW_INDEX)
+    in_codec = ZSTDCodec(RESULT_ZSTD_JSONL_FILE, mode='r')
+    out_codec = ZSTDCodec(ENRICHED_ZSTD_JSONL_FILE, mode='w')
+    # The source is only read, so it is opened in read mode (it was 'w' before).
+    in_iterable = JSONLinesIterable(codec=in_codec, mode='r')
+    out_iterable = JSONLinesIterable(codec=out_codec, mode='w')
+    marker = CATEGORY_TEXT + ':'
+    try:
+        for data in tqdm(in_iterable, total=total):
+            text = data['revision']['text']
+            if text is None or '#text' not in text.keys():
+                continue
+            parsed = wtp.parse(text['#text'])
+            data['categories'] = [
+                link.title.split(marker)[1]
+                for link in parsed.wikilinks
+                if marker in link.title
+            ]
+            out_iterable.write(data)
+    finally:
+        # Close both ends even when parsing or writing fails part-way.
+        in_iterable.close()
+        out_iterable.close()
+if __name__ == "__main__":
+    run()

iterable/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+# Package metadata.
+__author__ = "Ivan Begtin"
+__version__ = "1.0.7"
+# NOTE(review): spelled "licence"; code that looks for ``__license__`` will not find it.
+__licence__ = "MIT"
+__doc__ = "Iterable data processing Python library"
+from .helpers.detect import open_iterable
+# Short alias for open_iterable.
+open_it = open_iterable

iterable/base.py ADDED Viewed

@@ -0,0 +1,182 @@
+# -*- coding: utf-8 -*-
+# Source kinds stored in BaseFileIterable.stype.
+# An already-open stream was passed.
+ITERABLE_TYPE_STREAM = 10
+# A file name was passed.
+ITERABLE_TYPE_FILE = 20
+# A codec object was passed.
+ITERABLE_TYPE_CODEC = 30
+# Default ``num`` for BaseIterable.read_bulk().
+DEFAULT_BULK_NUMBER = 100
+import io
+import typing
+class BaseCodec:
+    """Base class for codecs that expose a file object for a named file.
+
+    Subclasses provide ``open``, ``close`` and ``fileexts``; the versions
+    defined here raise NotImplementedError.
+    """
+    def __init__(self, filename: str = None, fileobj: typing.IO = None, mode: str = 'r', open_it: bool = False, options: dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the file the codec works on.
+            fileobj: existing file object to expose through ``fileobj()``.
+            mode: mode string handed to the subclass' ``open``.
+            open_it: when true, ``open()`` is called immediately.
+            options: extra attributes to set on the instance; each key
+                becomes an attribute name. Applied after ``open()``.
+        """
+        self._fileobj = fileobj
+        self.filename = filename
+        self.mode = mode
+        if open_it:
+            self.open()
+        # ``options`` defaults to None so no dict is shared between calls.
+        if options:
+            for k, v in options.items():
+                setattr(self, k, v)
+    @staticmethod
+    def fileexts():
+        """Return file extensions"""
+        raise NotImplementedError
+    def reset(self):
+        """Reset the codec by closing the file and opening it again."""
+        # Re-opening is used instead of seek(0).
+        self.close()
+        self.open()
+    def open(self):
+        """Open the file. Not implemented by default"""
+        raise NotImplementedError
+    def fileobj(self):
+        """Return file object"""
+        return self._fileobj
+    def close(self):
+        """Close codec. Not implemented by default"""
+        raise NotImplementedError
+    def textIO(self, encoding:str = 'utf8'):
+        """Return a new write-through text wrapper over the binary stream."""
+        return io.TextIOWrapper(self.fileobj(), encoding=encoding, write_through=True)
+class BaseIterable:
+    """Interface shared by iterable data sources and sinks.
+
+    Iterating an instance calls ``read()`` for every item. The read, write
+    and reset methods defined here raise NotImplementedError and are meant
+    to be overridden.
+    """
+    def __init__(self):
+        pass
+    def reset(self):
+        """Reset iterator"""
+        raise NotImplementedError
+    @staticmethod
+    def id():
+        """Identifier of selected destination"""
+        raise NotImplementedError
+    @staticmethod
+    def has_totals():
+        """Has totals. Default: False"""
+        return False
+    def read(self, skip_empty:bool = True):
+        """Read single record"""
+        raise NotImplementedError
+    def read_bulk(self, num:int = DEFAULT_BULK_NUMBER):
+        """Read multiple records"""
+        raise NotImplementedError
+    @staticmethod
+    def is_flatonly():
+        """Is source flat by only. Default: False"""
+        return False
+    def is_flat(self):
+        """Return True for flat-only sources.
+
+        Raises:
+            NotImplementedError: when the source is not flat-only; such
+                classes have to override this method.
+        """
+        # Call is_flatonly() on this object. Building a throwaway instance
+        # with no arguments only to reach the static method fails for
+        # subclasses whose constructor needs a source.
+        if self.is_flatonly():
+            return True
+        raise NotImplementedError
+    def is_streaming(self):
+        """Is source streaming. Default: False"""
+        return False
+    def __next__(self):
+        return self.read()
+    def __iter__(self):
+        # Iteration continues from the current position; no reset is done.
+        return self
+    def write(self,  record: dict):
+        """Write single record"""
+        raise NotImplementedError
+    def write_bulk(self,  records: list[dict]):
+        """Write multiple records"""
+        raise NotImplementedError
+class BaseFileIterable(BaseIterable):
+    """Iterable backed by a file name, an already-open stream, or a codec."""
+    # With 'text', codec-backed sources are wrapped in the codec's text layer.
+    datamode = 'text'
+    def __init__(self, filename:str = None, stream:typing.IO = None, codec: BaseCodec = None, binary:bool = False, encoding:str = 'utf8', noopen:bool = False, mode:str = 'r', options:dict = None):
+        """Init basic file iterable.
+
+        The source kind is chosen in this order: ``stream``, then
+        ``filename``, then ``codec``.
+
+        Args:
+            filename: path opened by ``open()`` unless ``noopen`` is set.
+            stream: already-open file object used as is.
+            codec: codec object that is opened unless ``noopen`` is set.
+            binary: open ``filename`` in binary mode.
+            encoding: text encoding for files and codec text wrappers.
+            noopen: skip opening the file or codec in the constructor.
+            mode: open mode; ``'w'``/``'wb'`` disables codec resets.
+            options: extra attributes to set on the instance.
+
+        Raises:
+            ValueError: when none of ``stream``, ``filename`` or ``codec``
+                is given.
+        """
+        self.filename = filename
+        self.noopen = noopen
+        self.encoding = encoding
+        self.binary = binary
+        self.mode = mode
+        self.codec = codec
+        if stream is not None:
+            self.stype = ITERABLE_TYPE_STREAM
+        elif filename is not None:
+            self.stype = ITERABLE_TYPE_FILE
+        elif codec is not None:
+            self.stype = ITERABLE_TYPE_CODEC
+        else:
+            # Without this, the first use of self.stype below failed with
+            # an AttributeError that did not name the real problem.
+            raise ValueError('One of stream, filename or codec is required')
+        self.fobj = None
+        if self.stype == ITERABLE_TYPE_FILE:
+            if not noopen:
+                self.open()
+        elif self.stype == ITERABLE_TYPE_STREAM:
+            self.fobj = stream
+        elif self.stype == ITERABLE_TYPE_CODEC:
+            if not noopen:
+                self.fobj = self.codec.open()
+                if self.datamode == 'text':
+                    self.fobj = self.codec.textIO(encoding=self.encoding)
+        # ``options`` defaults to None so no dict is shared between calls.
+        if options:
+            for k, v in options.items():
+                setattr(self, k, v)
+    def open(self):
+        """Open file as file data source.
+
+        Raises:
+            NotImplementedError: for stream- and codec-backed sources.
+        """
+        if self.stype ==  ITERABLE_TYPE_FILE:
+            self.fobj = open(self.filename, self.mode + 'b') if self.binary else open(self.filename, self.mode, encoding=self.encoding)
+            return self.fobj
+        else:
+            raise NotImplementedError
+    def reset(self):
+        """Rewind the source: seek(0) for files, a codec reset for codecs.
+
+        Stream-backed sources are left untouched, and codec-backed sources
+        are not reset when opened for writing.
+        """
+        if self.stype == ITERABLE_TYPE_FILE:
+            if self.fobj is not None:
+                self.fobj.seek(0)
+        elif self.stype == ITERABLE_TYPE_CODEC:
+            if self.fobj is not None and self.mode not in ['w', 'wb']:
+                self.codec.reset()
+                # The codec has a new file object after the reset, so the
+                # reference (and the text wrapper) is rebuilt.
+                self.fobj = self.codec.fileobj()
+                if self.datamode == 'text':
+                    self.fobj = self.codec.textIO(encoding=self.encoding)
+    def close(self):
+        """Close file as file data source.
+
+        Stream-backed sources are not closed here.
+        """
+        if self.stype == ITERABLE_TYPE_FILE:
+            if self.fobj is not None:
+                self.fobj.close()
+        elif self.stype == ITERABLE_TYPE_CODEC:
+            if self.codec is not None:
+                self.codec.close()

iterable/codecs/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from .bz2codec import BZIP2Codec
+from .gzipcodec import GZIPCodec
+from .lzmacodec import LZMACodec
+from .lz4codec import LZ4Codec
+from .zipcodec import ZIPCodec
+from .zstdcodec import ZSTDCodec
+from .brotlicodec import BrotliCodec

iterable/codecs/brotlicodec.py ADDED Viewed

@@ -0,0 +1,28 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import brotli_file
+# Default value for BrotliCodec's compression_level, which is passed to brotli_file.open() as ``quality``.
+BROTLI_DEFAULT_COMPRESSION_LEVEL = 11
+class BrotliCodec(BaseCodec):
+    """Codec for Brotli-compressed files, backed by ``brotli_file``."""
+    def __init__(self, filename:str, compression_level:int = BROTLI_DEFAULT_COMPRESSION_LEVEL, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Codec to support Brotli compression.
+
+        Args:
+            filename: path of the compressed file.
+            compression_level: passed to ``brotli_file.open`` as ``quality``.
+            mode: mode string passed to ``brotli_file.open``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> brotli_file.BrotliFile:
+        """Open the file and return the new file object."""
+        self._fileobj = brotli_file.open(self.filename, mode=self.mode, quality=self.compression_level)
+        return self._fileobj
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'brotli'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['br', 'brotli']

iterable/codecs/bz2codec.py ADDED Viewed

@@ -0,0 +1,21 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import bz2
+class BZIP2Codec(BaseCodec):
+    """Codec for bzip2-compressed files, backed by the ``bz2`` module."""
+    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the compressed file.
+            compression_level: passed to ``bz2.open`` as ``compresslevel``.
+            mode: mode string passed to ``bz2.open``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> bz2.BZ2File:
+        """Open the file and return the new file object."""
+        self._fileobj = bz2.open(self.filename, self.mode, compresslevel=self.compression_level)
+        return self._fileobj
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        # NOTE(review): added so this codec has id() like its siblings;
+        # the value 'bz2' is chosen here - confirm against the codec lookup.
+        return 'bz2'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['bz2',]

iterable/codecs/gzipcodec.py ADDED Viewed

@@ -0,0 +1,26 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import gzip
+class GZIPCodec(BaseCodec):
+    """Codec for gzip-compressed files, backed by ``gzip.GzipFile``."""
+    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the compressed file.
+            compression_level: passed to ``GzipFile`` as ``compresslevel``.
+            mode: mode string passed to ``GzipFile``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> gzip.GzipFile:
+        """Open the file and return the new file object."""
+        self._fileobj = gzip.GzipFile(filename=self.filename, mode=self.mode, compresslevel=self.compression_level)
+        return self._fileobj
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'gzip'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['gz',]

iterable/codecs/lz4codec.py ADDED Viewed

@@ -0,0 +1,26 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import lz4.frame
+class LZ4Codec(BaseCodec):
+    """Codec for LZ4 frame files, backed by ``lz4.frame``."""
+    def __init__(self, filename:str, compression_level:int = lz4.frame.COMPRESSIONLEVEL_MINHC, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the compressed file.
+            compression_level: passed to ``lz4.frame.open``.
+            mode: mode string passed to ``lz4.frame.open``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> lz4.frame.LZ4FrameFile:
+        """Open the file and return the new file object."""
+        self._fileobj = lz4.frame.open(self.filename, mode=self.mode, compression_level=self.compression_level)
+        return self._fileobj
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'lz4'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['lz4',]

iterable/codecs/lzmacodec.py ADDED Viewed

@@ -0,0 +1,39 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import lzma
+# Filter chain: a delta filter followed by LZMA2 at preset 7 | PRESET_EXTREME.
+# NOTE(review): LZMAFile is opened without a ``filters`` argument in this
+# module, so this chain is currently not applied to any file.
+LZMA_FILTERS = [
+    {"id": lzma.FILTER_DELTA, "dist": 5},
+    {"id": lzma.FILTER_LZMA2, "preset": 7 | lzma.PRESET_EXTREME},
+]
+class LZMACodec(BaseCodec):
+    """Codec for XZ files, backed by ``lzma.LZMAFile``."""
+    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the compressed file.
+            compression_level: stored on the instance; not passed to
+                ``LZMAFile`` by ``open()``.
+            mode: mode string passed to ``LZMAFile``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> lzma.LZMAFile:
+        """Open the file in XZ format and return the new file object."""
+        # The module-level filter list is no longer modified here: it was
+        # written to on every open but never handed to LZMAFile.
+        self._fileobj = lzma.LZMAFile(self.filename, mode=self.mode, format=lzma.FORMAT_XZ)
+        return self._fileobj
+    def reset(self):
+        """Close and re-open the file; does nothing in write mode."""
+        if self.mode not in ['w', 'wb']:
+            super().reset()
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'xz'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['xz', 'lzma']

iterable/codecs/rawcodec.py ADDED Viewed

@@ -0,0 +1,23 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+class RAWCodec(BaseCodec):
+    """Pass-through codec that opens the file with the builtin ``open``."""
+    def __init__(self, filename:str, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the file.
+            mode: mode string passed to ``open``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> typing.IO:
+        """Open the file and return the new file object."""
+        self._fileobj = open(self.filename, self.mode)
+        return self._fileobj
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'raw'
+    @staticmethod
+    def fileexts() -> list[str] | None:
+        """Return None: no file extension is tied to this codec."""
+        return None

iterable/codecs/zipcodec.py ADDED Viewed

@@ -0,0 +1,34 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import zipfile
+class ZIPCodec(BaseCodec):
+    """Codec that exposes the first member of a ZIP archive as a file object."""
+    def __init__(self, filename:str, compression_level:int = 5, mode:str = 'r', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the archive.
+
+        Args:
+            filename: path of the ZIP archive.
+            compression_level: stored on the instance; not used by ``open()``.
+            mode: archive mode; ``'rb'`` is turned into ``'r'`` because it
+                is passed to ``zipfile.ZipFile``.
+            open_it: open the archive immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        if mode == 'rb':
+            mode = 'r'
+            self.filemode = 'rb'
+        else:
+            self.filemode = 'r'
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=mode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> typing.IO[bytes]:
+        """Open the archive and return a file object for its first member.
+
+        Raises:
+            IndexError: when the archive has no members.
+        """
+        self._archiveobj = zipfile.ZipFile(self.filename, mode=self.mode)
+        fnames = self._archiveobj.namelist()
+        # ZipFile.open() accepts only 'r' or 'w'. Passing the stored 'rb'
+        # raised ValueError, and members are read as bytes with 'r' anyway.
+        self._fileobj = self._archiveobj.open(fnames[0], 'r')
+        return self._fileobj
+    def close(self):
+        """Close the member and the archive; unopened parts are skipped."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+        archive = getattr(self, '_archiveobj', None)
+        if archive is not None:
+            archive.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'zip'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['zip',]

iterable/codecs/zstdcodec.py ADDED Viewed

@@ -0,0 +1,34 @@
+from __future__ import annotations
+import typing
+from ..base import BaseCodec
+import zstandard as zstd
+# Zstandard file object doesn't support file seek so reset rewritten to close and open file
+class ZSTDCodec(BaseCodec):
+    """Codec for Zstandard-compressed files, backed by ``zstandard``."""
+    def __init__(self, filename:str, compression_level:int = 0, mode:str = 'rb', open_it:bool = False, options:dict = None):
+        """Store the codec settings and optionally open the file.
+
+        Args:
+            filename: path of the compressed file.
+            compression_level: stored on the instance; not passed to
+                ``zstd.open`` by ``open()``.
+            mode: ``'r'`` and ``'rb'`` open the file as ``'rb'``; every
+                other value opens it as ``'wb'``.
+            open_it: open the file immediately.
+            options: extra attributes to set on the instance.
+        """
+        self.compression_level = compression_level
+        rmode = 'rb' if mode in ['r', 'rb'] else 'wb'
+        # A fresh dict per call replaces the shared mutable default.
+        super().__init__(filename, mode=rmode, open_it=open_it, options={} if options is None else options)
+    def open(self) -> zstd.ZstdDecompressionReader:
+        """Open the file and return the new file object."""
+        self._fileobj = zstd.open(self.filename, mode=self.mode)
+        return self._fileobj
+    def reset(self):
+        """Close the file and open it again."""
+        self.close()
+        self._fileobj = self.open()
+    def close(self):
+        """Close the file object; a codec that was never opened is left alone."""
+        if self._fileobj is not None:
+            self._fileobj.close()
+    @staticmethod
+    def id():
+        """Return the codec identifier."""
+        return 'zst'
+    @staticmethod
+    def fileexts() -> list[str]:
+        """Return the file extensions handled by this codec."""
+        return ['zstd', 'zst']

iterable/convert/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .core import convert

iterable/convert/core.py ADDED Viewed

@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+from ..helpers.detect import open_iterable, is_flat
+from ..helpers.utils import dict_generator, make_flat
+from tqdm import tqdm
+import logging
+import time
+# Default number of records collected before each write_bulk() call in convert().
+DEFAULT_BATCH_SIZE = 50000
+# Default number of records convert() scans to collect the output keys.
+DEFAULT_HEADERS_DETECT_LIMIT = 1000
+# NOTE(review): not referenced in the visible part of this module; presumably
+# the option names that may be forwarded to an iterable - confirm.
+ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+def convert(fromfile:str, tofile:str, iterableargs:dict=None, scan_limit:int=DEFAULT_HEADERS_DETECT_LIMIT, batch_size:int=DEFAULT_BATCH_SIZE, silent:bool=True, is_flatten:bool=False, use_totals:bool=False):
+    """Copy every record of *fromfile* into *tofile*.
+
+    Both files are opened with ``open_iterable``. When ``is_flat(tofile)``
+    is true, the first records of the source are scanned to collect the
+    key names, which are passed to the output as ``keys``.
+
+    Args:
+        fromfile: path of the source file.
+        tofile: path of the destination file.
+        iterableargs: options passed to ``open_iterable`` for the source.
+        scan_limit: maximum number of records scanned for key names;
+            ``None`` scans the whole source.
+        batch_size: number of records per ``write_bulk`` call.
+        silent: when false, progress bars are shown.
+        is_flatten: flatten records with ``make_flat`` before writing.
+        use_totals: ask the source for its record count for the progress
+            bar, when the source reports that it has totals.
+    """
+    # A fresh dict per call replaces the shared mutable default.
+    it_in = open_iterable(fromfile, mode='r', iterableargs={} if iterableargs is None else iterableargs)
+    try:
+        keys = []
+        is_flat_output = is_flat(tofile)
+        if is_flat_output:
+            if not silent:
+                logging.debug('Extracting schema')
+            # The bar is only created when a scan really happens.
+            it = tqdm(it_in, total=scan_limit, desc='Schema analysis') if not silent else it_in
+            seen = set()
+            n = 0
+            for item in it:
+                if scan_limit is not None and n >= scan_limit:
+                    break
+                n += 1
+                if not is_flatten:
+                    found = (".".join(i[:-1]) for i in dict_generator(item))
+                else:
+                    found = make_flat(item).keys()
+                for k in found:
+                    if k not in seen:
+                        seen.add(k)
+                        keys.append(k)
+            # Rewind once, after the scan. Resetting inside the loop made
+            # the scan re-read the first record on every pass and left the
+            # source positioned after that record for the copy below.
+            it_in.reset()
+        args = {'keys' : keys} if is_flat_output else {}
+        it_out = open_iterable(tofile, mode='w', iterableargs=args)
+        try:
+            logging.debug('Converting data')
+            if use_totals and it_in.has_totals():
+                totals = it_in.totals()
+                logging.debug(f'Total rows: {totals}')
+                it_in.reset()
+                it = tqdm(it_in, total=totals, desc='Converting') if not silent else it_in
+            else:
+                it = tqdm(it_in, desc='Converting') if not silent else it_in
+            batch = []
+            n = 0
+            for row in it:
+                n += 1
+                if is_flatten:
+                    # NOTE(review): missing keys are filled in before the
+                    # record is flattened, as in the original; confirm this
+                    # order is intended for nested records.
+                    for k in keys:
+                        if k not in row.keys():
+                            row[k] = None
+                    batch.append(make_flat(row))
+                else:
+                    batch.append(row)
+                if n % batch_size == 0:
+                    it_out.write_bulk(batch)
+                    batch = []
+            if batch:
+                it_out.write_bulk(batch)
+        finally:
+            it_out.close()
+    finally:
+        # Both files are closed even when the conversion fails part-way.
+        it_in.close()

iterable/datatypes/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from .avro import AVROIterable
+from .bsonf import BSONIterable
+from .csv import CSVIterable
+from .orc import ORCIterable
+from .parquet import ParquetIterable
+from .picklef import PickleIterable
+from .json import JSONIterable
+from .jsonl import JSONLinesIterable
+from .xls import XLSIterable
+from .xlsx import XLSXIterable
+from .xml import XMLIterable