undatum-1.0.17-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
+ # -*- coding: utf8 -*-
+ """DOCX file format handling and table extraction."""
+ import csv
+ import datetime
+ import json
+
+ import openpyxl
+ import xlwt
+ from docx import Document
+ from docx.oxml.simpletypes import ST_Merge
+ from docx.table import _Cell
+
+
+ def __extract_table(table, strip_space=False):
+     """Extracts table data from a python-docx table object."""
+     results = []
+     n = 0
+     for tr in table._tbl.tr_lst:
+         r = []
+         for tc in tr.tc_lst:
+             for grid_span_idx in range(tc.grid_span):
+                 if tc.vMerge == ST_Merge.CONTINUE:
+                     # Vertically merged cell: take the value directly above.
+                     # The current cell has not been appended yet, so its
+                     # column index is len(r), not len(r) - 1.
+                     value = results[n - 1][len(r)]
+                 elif grid_span_idx > 0:
+                     # Horizontally merged cell: repeat the previous value.
+                     value = r[-1]
+                 else:
+                     cell = _Cell(tc, table)
+                     value = cell.text.replace("\n", " ")
+                 if strip_space:
+                     value = value.strip()
+                 r.append(value)
+         results.append(r)
+         n += 1
+     return results
+
+
+ def __store_table(tabdata, filename, output_format="csv"):
+     """Saves table data as a csv, tsv, xls or xlsx file."""
+     if output_format in ("csv", "tsv"):
+         delimiter = "," if output_format == "csv" else "\t"
+         # newline='' keeps csv.writer from emitting blank rows on Windows
+         with open(filename, "w", encoding="utf8", newline="") as f:
+             w = csv.writer(f, delimiter=delimiter)
+             for row in tabdata:
+                 w.writerow(row)
+     elif output_format == "xls":
+         workbook = xlwt.Workbook()
+         __xls_table_to_sheet(tabdata, workbook.add_sheet("0"))
+         workbook.save(filename)
+     elif output_format == "xlsx":
+         workbook = openpyxl.Workbook()
+         # openpyxl workbooks start with an empty default sheet; drop it
+         workbook.remove(workbook.active)
+         __xlsx_table_to_sheet(tabdata, workbook.create_sheet("0"))
+         workbook.save(filename)
+
+
+ def __xls_table_to_sheet(table, ws):
+     """Writes table rows into an xlwt worksheet cell by cell."""
+     for rn, row in enumerate(table):
+         for cn, c in enumerate(row):
+             ws.write(rn, cn, c)
+     return ws
+
+
+ def __xlsx_table_to_sheet(table, ws):
+     """Appends table rows to an openpyxl worksheet."""
+     for row in table:
+         ws.append(row)
+     return ws
+
+
+ def extract_docx_tables(filename, strip_space=True):
+     """Extracts tables from a .docx file as a list of info dicts with data."""
+     tables = []
+     document = Document(filename)
+     n = 0
+     for table in document.tables:
+         n += 1
+         info = {}
+         info['id'] = n
+         info['num_cols'] = len(table.columns)
+         info['num_rows'] = len(table.rows)
+         info['style'] = table.style.name
+         tdata = __extract_table(table, strip_space=strip_space)
+         info['data'] = tdata
+         tables.append(info)
+     return tables
+
+
+
+ def extract(filename, output_format="csv", sizefilter=0, singlefile=False,
+             output=None, strip_space=True):
+     """Extracts tables from DOCX files and saves them as csv, tsv, xls, xlsx or json."""
+     tables = extract_docx_tables(filename, strip_space=strip_space)
+     name = filename.rsplit(".", 1)[0]
+     output_format = output_format.lower()
+     n = 0
+     lfilter = int(sizefilter)
+     if singlefile:
+         if output_format == "xls":
+             workbook = xlwt.Workbook()
+             for t in tables:
+                 # Skip tables with no more rows than the size filter
+                 if lfilter >= len(t['data']):
+                     continue
+                 n += 1
+                 __xls_table_to_sheet(t['data'], workbook.add_sheet(str(n)))
+             destname = output if output else f"{name}.{output_format}"
+             workbook.save(destname)
+         elif output_format == "xlsx":
+             workbook = openpyxl.Workbook()
+             for t in tables:
+                 if lfilter >= len(t['data']):
+                     continue
+                 n += 1
+                 __xlsx_table_to_sheet(t['data'], workbook.create_sheet(str(n)))
+             if len(workbook.sheetnames) > 1:
+                 workbook.remove(workbook.active)  # drop the empty default sheet
+             destname = output if output else f"{name}.{output_format}"
+             workbook.save(destname)
+         elif output_format == "json":
+             report = {'filename': filename,
+                       'timestamp': datetime.datetime.now().isoformat(),
+                       'num_tables': len(tables),
+                       'tables': tables}
+             destname = output if output else f"{name}.{output_format}"
+             with open(destname, 'w', encoding='utf8') as f:
+                 json.dump(report, f, ensure_ascii=False, indent=4)
+     else:
+         for t in tables:
+             if lfilter >= len(t['data']):
+                 continue
+             n += 1
+             destname = output if output else f"{name}_{n}.{output_format}"
+             __store_table(t['data'], destname, output_format)
+
+
+ def analyze_docx(filename, extract_data=None, strip_space=True):
+     """Analyzes a docx file, returning metadata and data for each table."""
+     # extract_data parameter kept for API compatibility but not used
+     tableinfo = []
+     document = Document(filename)
+     n = 0
+     for table in document.tables:
+         n += 1
+         info = {}
+         info['id'] = n
+         info['num_cols'] = len(table.columns)
+         info['num_rows'] = len(table.rows)
+         info['style'] = table.style.name
+         tdata = __extract_table(table, strip_space=strip_space)
+         info['data'] = tdata
+         tableinfo.append(info)
+     return tableinfo
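
For reference, a minimal usage sketch of this module (illustrative only, not part of the package; the import path is an assumption, since the diff does not show this file's name):

    from undatum.formats.docx import extract_docx_tables, extract  # hypothetical path

    tables = extract_docx_tables("report.docx")  # hypothetical input file
    for t in tables:
        print(t['id'], t['num_rows'], 'x', t['num_cols'], t['style'])
    extract("report.docx", output_format="xlsx", singlefile=True)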
undatum/utils.py ADDED
@@ -0,0 +1,298 @@
+ # -*- coding: utf8 -*-
+ """Utility functions for file operations and data processing.
+
+ This module provides helper functions for encoding detection, delimiter detection,
+ file type identification, dictionary manipulation, and data type guessing.
+ """
+ from collections import OrderedDict
+ from typing import Any, Dict, List, Optional, Union
+
+ import chardet
+
+ from .constants import DEFAULT_OPTIONS, SUPPORTED_FILE_TYPES
+
+
+ def detect_encoding(filename: str, limit: int = 1000000) -> Dict[str, Any]:
+     """Detect encoding of a file.
+
+     Args:
+         filename: Path to the file to analyze.
+         limit: Maximum number of bytes to read for detection (default: 1000000).
+
+     Returns:
+         Dictionary with encoding detection results from chardet.
+     """
+     with open(filename, 'rb') as f:
+         chunk = f.read(limit)
+     return chardet.detect(chunk)
+
+
+ def detect_delimiter(filename: str, encoding: str = 'utf8') -> str:
+     """Detect delimiter used in a CSV-like file.
+
+     Args:
+         filename: Path to the CSV file to analyze.
+         encoding: File encoding (default: 'utf8').
+
+     Returns:
+         Most likely delimiter character (',', ';', '\\t', or '|'), chosen by
+         counting candidate characters in the first line.
+     """
+     with open(filename, 'r', encoding=encoding) as f:
+         line = f.readline()
+     counts = {',': line.count(','), ';': line.count(';'),
+               '\t': line.count('\t'), '|': line.count('|')}
+     return max(counts, key=counts.get)
+
+
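# Illustrative usage (editorial annotation, not part of the package diff);
# assumes a local file named data.csv:
#
#     enc = detect_encoding('data.csv')['encoding'] or 'utf8'
#     delim = detect_delimiter('data.csv', encoding=enc)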
+ def get_file_type(filename: str) -> Optional[str]:
+     """Get file type based on extension.
+
+     Args:
+         filename: Path to the file.
+
+     Returns:
+         File extension if supported, None otherwise.
+     """
+     ext = filename.rsplit('.', 1)[-1].lower()
+     if ext in SUPPORTED_FILE_TYPES:
+         return ext
+     return None
+
+
+ def get_option(options: Dict[str, Any], name: str) -> Any:
+     """Get option value from options dict or default options.
+
+     Args:
+         options: Dictionary of user-provided options.
+         name: Option name to retrieve.
+
+     Returns:
+         Option value if found, None otherwise.
+     """
+     if name in options:
+         return options[name]
+     if name in DEFAULT_OPTIONS:
+         return DEFAULT_OPTIONS[name]
+     return None
+
+
+ def get_dict_value(d: Union[Dict[str, Any], List[Dict[str, Any]], None], keys: List[str]) -> List[Any]:
+     """Get dictionary value by nested keys.
+
+     Args:
+         d: Dictionary or list of dictionaries to search.
+         keys: List of nested keys to traverse.
+
+     Returns:
+         List of values found at the specified key path.
+     """
+     out = []
+     if d is None:
+         return out
+     if len(keys) == 1:
+         if isinstance(d, (dict, OrderedDict)):
+             if keys[0] in d:
+                 out.append(d[keys[0]])
+         else:
+             for r in d:
+                 if r and keys[0] in r:
+                     out.append(r[keys[0]])
+     else:
+         if isinstance(d, (dict, OrderedDict)):
+             if keys[0] in d:
+                 out.extend(get_dict_value(d[keys[0]], keys[1:]))
+         else:
+             for r in d:
+                 # Guard against None items, as in the single-key branch
+                 if r and keys[0] in r:
+                     out.extend(get_dict_value(r[keys[0]], keys[1:]))
+     return out
+
+
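# Illustrative example (editorial annotation, not part of the package diff):
#
#     rec = {'org': {'name': 'ACME', 'ids': [{'inn': '123'}]}}
#     get_dict_value(rec, ['org', 'name'])        # -> ['ACME']
#     get_dict_value(rec, ['org', 'ids', 'inn'])  # -> ['123']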
+ def strip_dict_fields(record: Dict[str, Any], fields: List[List[str]], startkey: int = 0) -> Dict[str, Any]:
+     """Strip dictionary fields based on field list.
+
+     Args:
+         record: Dictionary to process.
+         fields: List of field paths (nested keys as lists).
+         startkey: Starting index for field path (default: 0).
+
+     Returns:
+         Modified dictionary with only specified fields retained.
+     """
+     # Set gives O(1) membership checks
+     localf = set()
+     for field in fields:
+         if len(field) > startkey:
+             localf.add(field[startkey])
+     # Iterate over a copy of the keys to avoid modifying during iteration
+     keys = list(record.keys())
+     for k in keys:
+         if k not in localf:
+             del record[k]
+
+     for k in record:
+         if isinstance(record[k], dict):
+             record[k] = strip_dict_fields(record[k], fields, startkey + 1)
+     return record
+
+
+ def dict_generator(indict: Union[Dict[str, Any], Any], pre: Optional[List[str]] = None):
+     """Process dictionary and yield flattened key-value pairs.
+
+     Recursively traverses nested dictionaries and lists, yielding
+     key paths with their values. Skips '_id' keys.
+
+     Args:
+         indict: Input dictionary to process.
+         pre: Prefix keys list for nested structures (default: None).
+
+     Yields:
+         Lists containing key path and value: [key1, key2, ..., value]
+     """
+     pre = pre[:] if pre else []
+     if isinstance(indict, dict):
+         for key, value in indict.items():
+             if key == "_id":
+                 continue
+             if isinstance(value, dict):
+                 yield from dict_generator(value, pre + [key])
+             elif isinstance(value, (list, tuple)):
+                 # Only dict items inside lists are traversed; scalar list
+                 # items are skipped by design.
+                 for v in value:
+                     if isinstance(v, dict):
+                         yield from dict_generator(v, pre + [key])
+             else:
+                 yield pre + [key, value]
+     else:
+         yield indict
+
+
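# Illustrative example (editorial annotation, not part of the package diff):
#
#     list(dict_generator({'a': 1, 'b': {'c': 2}}))
#     # -> [['a', 1], ['b', 'c', 2]]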
+ def guess_int_size(i: int) -> str:
+     """Guess appropriate integer size type based on value.
+
+     Args:
+         i: Non-negative integer value to analyze.
+
+     Returns:
+         String indicating size type: 'uint8', 'uint16', or 'uint32'.
+     """
+     # Inclusive bounds: uint8 holds up to 255, uint16 up to 65535
+     if i <= 255:
+         return 'uint8'
+     if i <= 65535:
+         return 'uint16'
+     return 'uint32'
+
+
+ def guess_datatype(s: Union[str, int, float, None], qd: Any) -> Dict[str, Any]:
+     """Guess data type of a string value.
+
+     Analyzes a string to determine if it represents an integer, float,
+     date, empty value, or remains a string.
+
+     Args:
+         s: Value to analyze (can be string, int, float, or None).
+         qd: Query date matcher object for date detection.
+
+     Returns:
+         Dictionary with 'base' key indicating detected type and optional
+         'subtype' or 'pat' keys for additional information.
+     """
+     attrs = {'base': 'str'}
+     if s is None:
+         return {'base': 'empty'}
+     if isinstance(s, int):
+         return {'base': 'int'}
+     if isinstance(s, float):
+         return {'base': 'float'}
+     if not isinstance(s, str):
+         return {'base': 'typed'}
+     if s.isdigit():
+         # A leading zero means a numeric-looking string, not a number
+         if s[0] == '0':
+             attrs = {'base': 'numstr'}
+         else:
+             attrs = {'base': 'int', 'subtype': guess_int_size(int(s))}
+     else:
+         try:
+             float(s)
+             return {'base': 'float'}
+         except ValueError:
+             pass
+     if qd:
+         is_date = False
+         res = qd.match(s)
+         if res:
+             attrs = {'base': 'date', 'pat': res['pattern']}
+             is_date = True
+         if not is_date:
+             if len(s.strip()) == 0:
+                 attrs = {'base': 'empty'}
+     return attrs
+
+
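# Illustrative examples (editorial annotation, not part of the package diff);
# passing qd=None disables date matching:
#
#     guess_datatype('42', None)    # -> {'base': 'int', 'subtype': 'uint8'}
#     guess_datatype('007', None)   # -> {'base': 'numstr'}
#     guess_datatype('3.14', None)  # -> {'base': 'float'}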
+ def buf_count_newlines_gen(fname: str) -> int:
+     """Count newlines in a file using buffered reading.
+
+     Efficiently counts newline characters in large files by reading
+     in chunks rather than loading the entire file into memory.
+
+     Args:
+         fname: Path to the file to analyze.
+
+     Returns:
+         Integer count of newline characters in the file.
+     """
+     def _make_gen(reader):
+         # Yield 64 KiB chunks until EOF
+         while True:
+             b = reader(2 ** 16)
+             if not b:
+                 break
+             yield b
+
+     with open(fname, "rb") as f:
+         count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
+     return count
+
+
+ def get_dict_keys(iterable: Any, limit: int = 1000) -> List[str]:
+     """Get all unique dictionary keys from an iterable of dictionaries.
+
+     Extracts all nested keys from dictionaries, flattening them with dot notation.
+
+     Args:
+         iterable: Iterable of dictionaries to process.
+         limit: Maximum number of items to process (default: 1000).
+
+     Returns:
+         List of unique flattened key paths (e.g., ['field1', 'field2.subfield']).
+     """
+     n = 0
+     keys_set = set()  # Set gives O(1) membership checks
+     for item in iterable:
+         if limit and n > limit:
+             break
+         n += 1
+         for path in dict_generator(item):
+             keys_set.add(".".join(path[:-1]))
+     return list(keys_set)  # Callers expect a list
+
+
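# Illustrative example (editorial annotation, not part of the package diff):
#
#     records = [{'name': 'A', 'org': {'inn': '123'}}]
#     sorted(get_dict_keys(records))  # -> ['name', 'org.inn']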
+ def _is_flat(item: Dict[str, Any]) -> bool:
+     """Check if dictionary contains only flat values.
+
+     Args:
+         item: Dictionary to check.
+
+     Returns:
+         True if the dictionary contains no lists or tuples and any nested
+         dictionaries are themselves flat, False otherwise.
+     """
+     for v in item.values():
+         if isinstance(v, (tuple, list)):
+             return False
+         if isinstance(v, dict):
+             if not _is_flat(v):
+                 return False
+     return True
@@ -0,0 +1,11 @@
+ # -*- coding: utf8 -*-
+ """Validation rules module for undatum."""
+ from .commonrules import _validate_email, _validate_url
+ from .ruscodes import _check_inn, _check_ogrn
+
+ VALIDATION_RULEMAP = {
+     'ru.org.ogrn': _check_ogrn,
+     'ru.org.inn': _check_inn,
+     'common.email': _validate_email,
+     'common.url': _validate_url
+ }
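
A minimal dispatch sketch using this rule map (illustrative only; the package's own calling code is not shown in this diff):

    rule = VALIDATION_RULEMAP.get('common.email')
    if rule is not None:
        print(rule('user@example.org'))  # -> True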
@@ -0,0 +1,15 @@
+ # -*- coding: utf8 -*-
+ """Common validation rules for email and URL."""
+ from email.utils import parseaddr
+
+ import validators
+
+
+ def _validate_email(s):
+     """Loosely validate an email address by checking the parsed addr-spec."""
+     return '@' in parseaddr(s)[1]
+
+
+ def _validate_url(s):
+     """Validate a URL using the validators library."""
+     # validators.url returns True on success and a failure object otherwise;
+     # comparing with `is True` normalizes the result to a plain bool.
+     r = validators.url(s)
+     return r is True
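
Illustrative checks (editorial annotation, not part of the package diff):

    _validate_email('user@example.org')   # -> True
    _validate_email('not-an-email')       # -> False
    _validate_url('https://example.org')  # -> True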