undatum-1.0.17-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
+ # -*- coding: utf8 -*-
+ """Data transformation module."""
+ import logging
+ import sys
+ from runpy import run_path
+
+ import orjson
+ from iterable.helpers.detect import open_iterable
+
+ from ..utils import get_option, dict_generator
+
+ ITERABLE_OPTIONS_KEYS = ['tagname', 'delimiter', 'encoding', 'start_line', 'page']
+
+
+ def get_iterable_options(options):
+     """Extract iterable-specific options from options dictionary."""
+     out = {}
+     for k in ITERABLE_OPTIONS_KEYS:
+         if k in options:
+             out[k] = options[k]
+     return out
+
+
+ DEFAULT_HEADERS_DETECT_LIMIT = 1000
+
+
+ class Transformer:
+     """Data transformation handler."""
+     def __init__(self):
+         pass
+
+     def script(self, fromfile, options=None):
+         """Runs a user-provided script against the selected file."""
+         if options is None:
+             options = {}
+         script = run_path(options['script'])
+         process_func = script['process']
+
+         iterableargs = get_iterable_options(options)
+         read_iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+
+         limit = DEFAULT_HEADERS_DETECT_LIMIT
+
+         # First pass: collect output keys from a sample of processed records
+         keys_set = set()  # set gives O(1) membership checks instead of O(n) list scans
+         n = 0
+         for item in read_iterable:
+             if limit and n >= limit:
+                 break
+             item = process_func(item)
+             n += 1
+             dk = dict_generator(item)
+             for i in dk:
+                 k = ".".join(i[:-1])
+                 keys_set.add(k)
+         keys = list(keys_set)
+
+         # Second pass: reopen the source and stream transformed records out
+         read_iterable.close()
+         read_iterable = open_iterable(fromfile, mode='r', iterableargs=iterableargs)
+
+         write_to_iterable = False
+         to_file = get_option(options, 'output')
+         if to_file:
+             write_to_iterable = True
+             write_iterable = open_iterable(to_file, mode='w', iterableargs={'keys': keys})
+         n = 0
+         for r in read_iterable:
+             n += 1
+             if n % 10000 == 0:
+                 logging.info('apply script: processing %d records of %s', n, fromfile)
+             item = process_func(r)
+             if write_to_iterable:
+                 write_iterable.write(item)
+             else:
+                 sys.stdout.write(orjson.dumps(item, option=orjson.OPT_APPEND_NEWLINE).decode('utf8'))
+
+         logging.debug('script: %d records processed', n)
+         read_iterable.close()
+         if write_to_iterable:
+             write_iterable.close()
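
The script command loads the user-supplied file with runpy.run_path and calls its process function once per record, so the script only needs to define process(item) -> item. A minimal sketch of such a script (the field name is invented for illustration):

    # transform.py -- a hypothetical script passed via options['script']
    def process(item):
        # 'name' is a made-up field; normalize it when present
        if 'name' in item:
            item['name'] = item['name'].strip().title()
        return item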
@@ -0,0 +1,142 @@
+ # -*- coding: utf8 -*-
+ """Data validation module."""
+ import csv
+ import io
+ import logging
+ import sys
+ import zipfile
+
+ import bson
+ import dictquery as dq
+ import orjson
+
+ from ..utils import get_file_type, get_option, get_dict_value
+ from ..validate import VALIDATION_RULEMAP
+
+
+ class Validator:
+     """Data validation handler."""
+     def __init__(self):
+         pass
+
+     def validate(self, fromfile, options=None):
+         """Validates selected field against validation rule."""
+         if options is None:
+             options = {}
+         logging.debug('Processing %s', fromfile)
+         f_type = get_file_type(fromfile) if options.get('format_in') is None else options['format_in']
+         if options.get('zipfile'):
+             z = zipfile.ZipFile(fromfile, mode='r')
+             fnames = z.namelist()
+             # ZipFile.open always returns a binary stream; wrap text formats
+             infile = z.open(fnames[0], 'r')
+             if f_type != 'bson':
+                 infile = io.TextIOWrapper(infile, encoding=get_option(options, 'encoding'))
+         else:
+             if f_type == 'bson':
+                 infile = open(fromfile, 'rb')
+             else:
+                 infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
+         to_file = get_option(options, 'output')
+         if to_file:
+             to_file_type = get_file_type(to_file)
+             if not to_file_type:
+                 logging.debug('Output file type not supported')
+                 return
+             out = open(to_file, 'w', encoding='utf8')
+         else:
+             out = sys.stdout
+         fields = options['fields'].split(',')
+         val_func = VALIDATION_RULEMAP[options['rule']]
+         logging.info('validate: looking for fields: %s', options['fields'])
+         validated = []
+         stats = {'total': 0, 'invalid': 0, 'novalue': 0}
+         if f_type == 'csv':
+             delimiter = get_option(options, 'delimiter')
+             reader = csv.DictReader(infile, delimiter=delimiter)
+             n = 0
+             for r in reader:
+                 n += 1
+                 if n % 1000 == 0:
+                     logging.info('validate: processing %d records of %s', n, fromfile)
+                 if options.get('filter') is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 res = val_func(r[fields[0]])
+                 stats['total'] += 1
+                 if not res:
+                     stats['invalid'] += 1
+                 validated.append({fields[0]: r[fields[0]], fields[0] + '_valid': res})
+
+         elif f_type == 'jsonl':
+             n = 0
+             for line in infile:
+                 n += 1
+                 if n % 10000 == 0:
+                     logging.info('validate: processing %d records of %s', n, fromfile)
+                 r = orjson.loads(line)
+                 if options.get('filter') is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 stats['total'] += 1
+                 values = get_dict_value(r, fields[0].split('.'))
+                 if len(values) > 0:
+                     res = val_func(values[0])
+                     if not res:
+                         stats['invalid'] += 1
+                     validated.append({fields[0]: values[0], fields[0] + '_valid': res})
+                 else:
+                     stats['novalue'] += 1
+
+         elif f_type == 'bson':
+             bson_iter = bson.decode_file_iter(infile)
+             n = 0
+             for r in bson_iter:
+                 n += 1
+                 if n % 1000 == 0:
+                     logging.info('validate: processing %d records of %s', n, fromfile)
+                 if options.get('filter') is not None:
+                     if not dq.match(r, options['filter']):
+                         continue
+                 stats['total'] += 1
+                 values = get_dict_value(r, fields[0].split('.'))
+                 if len(values) > 0:
+                     res = val_func(values[0])
+                     if not res:
+                         stats['invalid'] += 1
+                     validated.append({fields[0]: values[0], fields[0] + '_valid': res})
+                 else:
+                     stats['novalue'] += 1
+         else:
+             logging.error('Invalid file format provided')
+             if not options.get('zipfile'):
+                 infile.close()
+             return
+         if not options.get('zipfile'):
+             infile.close()
+         if stats['total'] > 0:
+             stats['share'] = 100.0 * stats['invalid'] / stats['total']
+             novalue_share = 100.0 * stats['novalue'] / stats['total']
+         else:
+             stats['share'] = 0.0
+             novalue_share = 0.0
+         logging.debug('validate: complete, %d records (%.2f%%) not valid and %d '
+                       '(%.2f%%) not found of %d against %s',
+                       stats['invalid'], stats['share'], stats['novalue'],
+                       novalue_share, stats['total'], options['rule'])
+         if options['mode'] != 'stats':
+             fieldnames = [fields[0], fields[0] + '_valid']
+             writer = csv.DictWriter(out, fieldnames=fieldnames,
+                                     delimiter=get_option(options, 'delimiter'))
+             for row in validated:
+                 if options['mode'] == 'invalid':
+                     if not row[fields[0] + '_valid']:
+                         writer.writerow(row)
+                 elif options['mode'] == 'all':
+                     writer.writerow(row)
+         else:
+             out.write(orjson.dumps(stats, option=orjson.OPT_INDENT_2).decode('utf8'))
+         if to_file:
+             out.close()
+         if options.get('zipfile'):
+             z.close()
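
For orientation, a hedged sketch of calling Validator.validate directly. The option keys mirror the ones the method reads; the rule name 'common.email' is an assumption, since the contents of VALIDATION_RULEMAP are not shown in this diff:

    # hypothetical invocation; 'common.email' assumes such a rule is registered
    options = {
        'format_in': None,          # autodetect from the file extension
        'zipfile': False,
        'encoding': 'utf8',
        'delimiter': ',',
        'fields': 'contact.email',  # dot-notation path into each record
        'rule': 'common.email',
        'filter': None,
        'mode': 'stats',            # print aggregate stats as JSON
        'output': None,             # None writes to stdout
    }
    Validator().validate('data.jsonl', options=options)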
@@ -0,0 +1,6 @@
+ # -*- coding: utf8 -*-
+ """Common utilities and helper functions for data processing.
+
+ This module provides shared functionality used across the undatum package,
+ including iterable data handling, schema management, and common functions.
+ """
@@ -0,0 +1,84 @@
+ # -*- coding: utf8 -*-
+ """Common utility functions for dictionary operations.
+
+ This module provides helper functions for accessing nested dictionary
+ values using dot-notation keys.
+ """
+
+
+ def get_dict_value(adict, key, prefix=None):
+     """Get value from dictionary using dot-notation key.
+
+     Args:
+         adict: Dictionary to search.
+         key: Dot-separated key path (e.g., 'field.subfield').
+         prefix: Pre-split key parts (used internally for recursion).
+
+     Returns:
+         Value at the specified key path.
+
+     Raises:
+         KeyError: If any key in the path doesn't exist.
+     """
+     if prefix is None:
+         prefix = key.split('.')
+     if len(prefix) == 1:
+         return adict[prefix[0]]
+     return get_dict_value(adict[prefix[0]], key, prefix=prefix[1:])
+
+
+ def get_dict_value_deep(adict, key, prefix=None, as_array=False, splitter='.'):
+     """Get value from hierarchical dictionaries with deep traversal.
+
+     Supports nested dictionaries and lists, with optional array collection
+     of values from multiple sources.
+
+     Args:
+         adict: Dictionary or list to search.
+         key: Dot-separated key path (e.g., 'field.subfield').
+         prefix: Pre-split key parts (used internally for recursion).
+         as_array: If True, collect all matching values into an array.
+         splitter: Character used to split key path (default: '.').
+
+     Returns:
+         Value at the specified key path, or list of values if as_array=True.
+         Returns None if key path not found.
+     """
+     if prefix is None:
+         prefix = key.split(splitter)
+     if len(prefix) == 1:
+         if isinstance(adict, dict):
+             if prefix[0] not in adict:
+                 return None
+             if as_array:
+                 return [adict[prefix[0]]]
+             return adict[prefix[0]]
+         if isinstance(adict, list):
+             if as_array:
+                 result = []
+                 for v in adict:
+                     if prefix[0] in v:
+                         result.append(v[prefix[0]])
+                 return result
+             if len(adict) > 0 and prefix[0] in adict[0]:
+                 return adict[0][prefix[0]]
+         return None
+     if isinstance(adict, dict):
+         if prefix[0] in adict:
+             return get_dict_value_deep(adict[prefix[0]], key, prefix=prefix[1:],
+                                        as_array=as_array)
+     elif isinstance(adict, list):
+         if as_array:
+             result = []
+             for v in adict:
+                 if prefix[0] not in v:
+                     continue
+                 res = get_dict_value_deep(v[prefix[0]], key, prefix=prefix[1:],
+                                           as_array=as_array)
+                 if res:
+                     result.extend(res)
+             return result
+         if len(adict) > 0 and prefix[0] in adict[0]:
+             return get_dict_value_deep(adict[0][prefix[0]], key, prefix=prefix[1:],
+                                        as_array=as_array)
+     return None
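
To make the traversal rules concrete, a small illustration of both helpers on an invented nested record:

    record = {'user': {'emails': [{'addr': 'a@example.org'}, {'addr': 'b@example.org'}]}}

    get_dict_value(record, 'user.emails')
    # -> the list of email dicts; a missing path raises KeyError

    get_dict_value_deep(record, 'user.emails.addr', as_array=True)
    # -> ['a@example.org', 'b@example.org']; list elements are fanned out

    get_dict_value_deep(record, 'user.phone')
    # -> None; the deep lookup swallows missing keys instead of raising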
@@ -0,0 +1,181 @@
+ # -*- coding: utf-8 -*-
+ """Iterable data handling module."""
+ import csv
+ import gzip
+ import io
+ import logging
+ from bz2 import BZ2File
+ from lzma import LZMAFile
+ from zipfile import ZipFile
+
+ import bson
+ import jsonlines
+
+ from ..constants import BINARY_FILE_TYPES
+ from ..utils import get_file_type, get_option, detect_encoding, detect_delimiter
+
+ # gzip, zip, xz and bz2 come from the standard library; lz4 and 7z need extras
+ SUPPORTED_COMPRESSION = {'gz': True, 'zip': True, 'xz': True, '7z': False, 'lz4': False, 'bz2': True}
+
+ try:
+     import lz4
+     SUPPORTED_COMPRESSION['lz4'] = True
+ except ImportError:
+     pass
+
+ try:
+     import py7zr
+     SUPPORTED_COMPRESSION['7z'] = True
+ except ImportError:
+     pass
+
+
+ DEFAULT_ENCODING = 'utf8'
+ DEFAULT_DELIMITER = ','
+
+
+ class IterableData:
+     """Iterable data reader (CSV, JSON lines, BSON)."""
+     def __init__(self, filename, options=None, autodetect=True, autodetect_limit=100000):
+         """Creates iterable object from CSV, JSON lines and other iterable files."""
+         if options is None:
+             options = {}
+         self.autodetect = autodetect
+         self.autodetect_limit = autodetect_limit
+         self.options = options
+         self.archiveobj = None
+         self.fileobj = None
+         self.binary = False
+         self.delimiter = get_option(options, 'delimiter')
+         self.init(filename, options)
+
+     def init(self, filename, options):
+         f_type = get_file_type(filename) if options.get('format_in') is None else options['format_in']
+         self.encoding = get_option(options, 'encoding')
+         self.filetype = f_type
+         ext = filename.rsplit('.', 1)[-1].lower()
+         self.ext = ext
+         if ext in SUPPORTED_COMPRESSION:
+             self.binary = True
+             self.mode = 'rb' if self.filetype in BINARY_FILE_TYPES else 'r'
+             if ext == 'gz':
+                 self.fileobj = gzip.open(filename, self.mode)
+             elif ext == 'bz2':
+                 self.fileobj = BZ2File(filename, self.mode)
+             elif ext == 'xz':
+                 self.fileobj = LZMAFile(filename, self.mode)
+             elif ext == 'zip':
+                 self.archiveobj = ZipFile(filename, mode='r')
+                 fnames = self.archiveobj.namelist()
+                 # ZipFile.open only accepts 'r' or 'w' and always returns a binary stream
+                 self.fileobj = self.archiveobj.open(fnames[0], 'r')
+             else:
+                 raise NotImplementedError
+         else:
+             if f_type in BINARY_FILE_TYPES:
+                 self.fileobj = open(filename, 'rb')
+             else:
+                 if 'encoding' in options and options['encoding']:
+                     encoding = get_option(options, 'encoding')
+                 elif self.autodetect:
+                     detected_enc = detect_encoding(filename, limit=self.autodetect_limit)
+                     encoding = detected_enc['encoding'] if detected_enc else DEFAULT_ENCODING
+                     logging.debug('Detected encoding %s', encoding)
+                     if f_type == 'csv':
+                         detected_del = detect_delimiter(filename, encoding)
+                         self.delimiter = detected_del if detected_del else DEFAULT_DELIMITER
+                 else:
+                     encoding = DEFAULT_ENCODING
+                     if f_type == 'csv':
+                         self.delimiter = DEFAULT_DELIMITER
+                 self.encoding = encoding
+                 self.fileobj = open(filename, 'r', encoding=encoding)
+
+     def iter(self):
+         if self.filetype == 'csv':
+             if self.binary:
+                 obj = io.TextIOWrapper(self.fileobj, encoding=self.encoding)
+                 reader = csv.DictReader(obj, delimiter=self.delimiter)
+             else:
+                 reader = csv.DictReader(self.fileobj, delimiter=self.delimiter)
+             return iter(reader)
+         elif self.filetype == 'jsonl':
+             return jsonlines.Reader(self.fileobj)
+         elif self.filetype == 'bson':
+             return bson.decode_file_iter(self.fileobj)
+         raise NotImplementedError('Unsupported file type: %s' % self.filetype)
+
+     def close(self):
+         """Closes file object and archive file object if it exists"""
+         if self.fileobj is not None:
+             self.fileobj.close()
+         if self.archiveobj is not None:
+             self.archiveobj.close()
+
+
+ class BSONWriter:
+     """BSON file writer."""
+     def __init__(self, fileobj):
+         self.fo = fileobj
+
+     def write(self, item):
+         rec = bson.BSON.encode(item)
+         self.fo.write(rec)
+
+
+ class DataWriter:
+     """Data writer (CSV, JSON lines, BSON)."""
+     def __init__(self, fileobj, filetype, output_type: str = 'iterable', delimiter: str = ',', fieldnames: list = None):
+         """Creates a writer for CSV, JSON lines or BSON output."""
+         self.output_type = output_type
+         self.filetype = filetype
+         self.fieldnames = fieldnames
+         self.fileobj = fileobj
+         if self.filetype == 'csv':
+             self.writer = csv.DictWriter(self.fileobj, delimiter=delimiter, fieldnames=fieldnames)
+         elif self.filetype == 'jsonl':
+             self.writer = jsonlines.Writer(self.fileobj)
+         elif self.filetype == 'bson':
+             self.writer = BSONWriter(self.fileobj)
+
+     def write_items(self, outdata):
+         if len(outdata) == 0:
+             return
+         if self.filetype == 'csv':
+             self.writer.writeheader()
+             if isinstance(outdata[0], str):
+                 # An array of plain strings maps onto the first field name
+                 for rawitem in outdata:
+                     item = {self.fieldnames[0]: rawitem}
+                     self.writer.writerow(item)
+             elif isinstance(outdata[0], (list, tuple)):
+                 for rawitem in outdata:
+                     item = dict(zip(self.fieldnames, rawitem))
+                     self.writer.writerow(item)
+             else:
+                 self.writer.writerows(outdata)
+         elif self.filetype in ['jsonl', 'bson']:
+             # If our data is just an array of strings, transform each value to a dict
+             if isinstance(outdata[0], str):
+                 for rawitem in outdata:
+                     item = {self.fieldnames[0]: rawitem}
+                     self.writer.write(item)
+             elif isinstance(outdata[0], (list, tuple)):
+                 for rawitem in outdata:
+                     item = dict(zip(self.fieldnames, rawitem))
+                     self.writer.write(item)
+             else:
+                 if self.output_type == 'iterable':
+                     for item in outdata:
+                         self.writer.write(item)
+                 elif self.output_type == 'duckdb':
+                     # Rows from a database cursor arrive as tuples; pair them with field names
+                     for rawitem in outdata:
+                         item = dict(zip(self.fieldnames, rawitem))
+                         self.writer.write(item)
+
+
+ if __name__ == "__main__":
+     with open('outtest.jsonl', 'w') as f:
+         writer = DataWriter(f, filetype='jsonl', fieldnames=['name', 'value'])
+         writer.write_items([{'name': 'Cat', 'value': 15}])
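
A short usage sketch tying the reader and writer together; the file names are invented, and the options dict carries only the keys that IterableData.init actually consults:

    # hypothetical round trip: read a gzipped CSV, re-emit it as JSON lines
    data = IterableData('input.csv.gz', options={'format_in': 'csv', 'encoding': 'utf8', 'delimiter': ','})
    rows = list(data.iter())
    data.close()

    with open('output.jsonl', 'w') as out:
        writer = DataWriter(out, filetype='jsonl', fieldnames=list(rows[0].keys()) if rows else [])
        writer.write_items(rows)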