undatum-1.0.17-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
+ # -*- coding: utf8 -*-
+ """DOCX file format handling and table extraction."""
+ import csv
+ import datetime
+ import json
+
+ import openpyxl
+ import xlwt
+ from docx import Document
+ from docx.oxml.simpletypes import ST_Merge
+ from docx.table import _Cell
+
+
+ def __extract_table(table, strip_space=False):
+     """Extracts table data from a python-docx table object."""
+     results = []
+     n = 0
+     for tr in table._tbl.tr_lst:
+         r = []
+         for tc in tr.tc_lst:
+             for grid_span_idx in range(tc.grid_span):
+                 if tc.vMerge == ST_Merge.CONTINUE:
+                     # Vertically merged cell: take the value directly above.
+                     # The current cell has not been appended yet, so its
+                     # column index is len(r), not len(r) - 1.
+                     value = results[n - 1][len(r)]
+                 elif grid_span_idx > 0:
+                     # Horizontally merged cell: repeat the previous value.
+                     value = r[-1]
+                 else:
+                     cell = _Cell(tc, table)
+                     value = cell.text.replace("\n", " ")
+                 if strip_space:
+                     value = value.strip()
+                 r.append(value)
+         results.append(r)
+         n += 1
+     return results
+
+
+ def __store_table(tabdata, filename, output_format="csv"):
+     """Saves table data as a csv, tsv, xls or xlsx file."""
+     if output_format in ("csv", "tsv"):
+         delimiter = "," if output_format == "csv" else "\t"
+         # newline='' keeps csv.writer from emitting blank rows on Windows
+         with open(filename, "w", encoding="utf8", newline="") as f:
+             w = csv.writer(f, delimiter=delimiter)
+             for row in tabdata:
+                 w.writerow(row)
+     elif output_format == "xls":
+         workbook = xlwt.Workbook()
+         __xls_table_to_sheet(tabdata, workbook.add_sheet("0"))
+         workbook.save(filename)
+     elif output_format == "xlsx":
+         workbook = openpyxl.Workbook()
+         # openpyxl workbooks start with an empty default sheet; drop it
+         workbook.remove(workbook.active)
+         __xlsx_table_to_sheet(tabdata, workbook.create_sheet("0"))
+         workbook.save(filename)
+
+
+ def __xls_table_to_sheet(table, ws):
+     """Writes table rows into an xlwt worksheet cell by cell."""
+     for rn, row in enumerate(table):
+         for cn, c in enumerate(row):
+             ws.write(rn, cn, c)
+     return ws
+
+
+ def __xlsx_table_to_sheet(table, ws):
+     """Appends table rows to an openpyxl worksheet."""
+     for row in table:
+         ws.append(row)
+     return ws
+
+
+ def extract_docx_tables(filename, strip_space=True):
+     """Extracts tables from a .docx file as a list of info dicts with data."""
+     tables = []
+     document = Document(filename)
+     n = 0
+     for table in document.tables:
+         n += 1
+         info = {}
+         info['id'] = n
+         info['num_cols'] = len(table.columns)
+         info['num_rows'] = len(table.rows)
+         info['style'] = table.style.name
+         tdata = __extract_table(table, strip_space=strip_space)
+         info['data'] = tdata
+         tables.append(info)
+     return tables
+
+
+
+ def extract(filename, output_format="csv", sizefilter=0, singlefile=False,
+             output=None, strip_space=True):
+     """Extracts tables from DOCX files and saves them as csv, tsv, xls, xlsx or json."""
+     tables = extract_docx_tables(filename, strip_space=strip_space)
+     name = filename.rsplit(".", 1)[0]
+     output_format = output_format.lower()
+     n = 0
+     lfilter = int(sizefilter)
+     if singlefile:
+         if output_format == "xls":
+             workbook = xlwt.Workbook()
+             for t in tables:
+                 # Skip tables with no more rows than the size filter
+                 if lfilter >= len(t['data']):
+                     continue
+                 n += 1
+                 __xls_table_to_sheet(t['data'], workbook.add_sheet(str(n)))
+             destname = output if output else f"{name}.{output_format}"
+             workbook.save(destname)
+         elif output_format == "xlsx":
+             workbook = openpyxl.Workbook()
+             for t in tables:
+                 if lfilter >= len(t['data']):
+                     continue
+                 n += 1
+                 __xlsx_table_to_sheet(t['data'], workbook.create_sheet(str(n)))
+             if len(workbook.sheetnames) > 1:
+                 workbook.remove(workbook.active)  # drop the empty default sheet
+             destname = output if output else f"{name}.{output_format}"
+             workbook.save(destname)
+         elif output_format == "json":
+             report = {'filename': filename,
+                       'timestamp': datetime.datetime.now().isoformat(),
+                       'num_tables': len(tables),
+                       'tables': tables}
+             destname = output if output else f"{name}.{output_format}"
+             with open(destname, 'w', encoding='utf8') as f:
+                 json.dump(report, f, ensure_ascii=False, indent=4)
+     else:
+         for t in tables:
+             if lfilter >= len(t['data']):
+                 continue
+             n += 1
+             destname = output if output else f"{name}_{n}.{output_format}"
+             __store_table(t['data'], destname, output_format)
+
+
+ def analyze_docx(filename, extract_data=None, strip_space=True):
+     """Analyzes a docx file, returning metadata and data for each table."""
+     # extract_data parameter kept for API compatibility but not used
+     tableinfo = []
+     document = Document(filename)
+     n = 0
+     for table in document.tables:
+         n += 1
+         info = {}
+         info['id'] = n
+         info['num_cols'] = len(table.columns)
+         info['num_rows'] = len(table.rows)
+         info['style'] = table.style.name
+         tdata = __extract_table(table, strip_space=strip_space)
+         info['data'] = tdata
+         tableinfo.append(info)
+     return tableinfo
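
For reference, a minimal usage sketch of this module (illustrative only, not part of the package; the import path is an assumption, since the diff does not show this file's name):

    from undatum.formats.docx import extract_docx_tables, extract  # hypothetical path

    tables = extract_docx_tables("report.docx")  # hypothetical input file
    for t in tables:
        print(t['id'], t['num_rows'], 'x', t['num_cols'], t['style'])
    extract("report.docx", output_format="xlsx", singlefile=True)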
undatum/utils.py ADDED
@@ -0,0 +1,298 @@
+ # -*- coding: utf8 -*-
+ """Utility functions for file operations and data processing.
+
+ This module provides helper functions for encoding detection, delimiter detection,
+ file type identification, dictionary manipulation, and data type guessing.
+ """
+ from collections import OrderedDict
+ from typing import Any, Dict, List, Optional, Union
+
+ import chardet
+
+ from .constants import DEFAULT_OPTIONS, SUPPORTED_FILE_TYPES
+
+
+ def detect_encoding(filename: str, limit: int = 1000000) -> Dict[str, Any]:
+     """Detect encoding of a file.
+
+     Args:
+         filename: Path to the file to analyze.
+         limit: Maximum number of bytes to read for detection (default: 1000000).
+
+     Returns:
+         Dictionary with encoding detection results from chardet.
+     """
+     with open(filename, 'rb') as f:
+         chunk = f.read(limit)
+     return chardet.detect(chunk)
+
+
+ def detect_delimiter(filename: str, encoding: str = 'utf8') -> str:
+     """Detect delimiter used in a CSV-like file.
+
+     Args:
+         filename: Path to the CSV file to analyze.
+         encoding: File encoding (default: 'utf8').
+
+     Returns:
+         Most likely delimiter character (',', ';', '\\t', or '|'), chosen by
+         counting candidate characters in the first line.
+     """
+     with open(filename, 'r', encoding=encoding) as f:
+         line = f.readline()
+     counts = {',': line.count(','), ';': line.count(';'),
+               '\t': line.count('\t'), '|': line.count('|')}
+     return max(counts, key=counts.get)
+
+
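# Illustrative usage (editorial annotation, not part of the package diff);
# assumes a local file named data.csv:
#
#     enc = detect_encoding('data.csv')['encoding'] or 'utf8'
#     delim = detect_delimiter('data.csv', encoding=enc)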
+ def get_file_type(filename: str) -> Optional[str]:
+     """Get file type based on extension.
+
+     Args:
+         filename: Path to the file.
+
+     Returns:
+         File extension if supported, None otherwise.
+     """
+     ext = filename.rsplit('.', 1)[-1].lower()
+     if ext in SUPPORTED_FILE_TYPES:
+         return ext
+     return None
+
+
+ def get_option(options: Dict[str, Any], name: str) -> Any:
+     """Get option value from options dict or default options.
+
+     Args:
+         options: Dictionary of user-provided options.
+         name: Option name to retrieve.
+
+     Returns:
+         Option value if found, None otherwise.
+     """
+     if name in options:
+         return options[name]
+     if name in DEFAULT_OPTIONS:
+         return DEFAULT_OPTIONS[name]
+     return None
+
+
+ def get_dict_value(d: Union[Dict[str, Any], List[Dict[str, Any]], None], keys: List[str]) -> List[Any]:
+     """Get dictionary value by nested keys.
+
+     Args:
+         d: Dictionary or list of dictionaries to search.
+         keys: List of nested keys to traverse.
+
+     Returns:
+         List of values found at the specified key path.
+     """
+     out = []
+     if d is None:
+         return out
+     if len(keys) == 1:
+         if isinstance(d, (dict, OrderedDict)):
+             if keys[0] in d:
+                 out.append(d[keys[0]])
+         else:
+             for r in d:
+                 if r and keys[0] in r:
+                     out.append(r[keys[0]])
+     else:
+         if isinstance(d, (dict, OrderedDict)):
+             if keys[0] in d:
+                 out.extend(get_dict_value(d[keys[0]], keys[1:]))
+         else:
+             for r in d:
+                 # Guard against None items, as in the single-key branch
+                 if r and keys[0] in r:
+                     out.extend(get_dict_value(r[keys[0]], keys[1:]))
+     return out
+
+
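# Illustrative example (editorial annotation, not part of the package diff):
#
#     rec = {'org': {'name': 'ACME', 'ids': [{'inn': '123'}]}}
#     get_dict_value(rec, ['org', 'name'])        # -> ['ACME']
#     get_dict_value(rec, ['org', 'ids', 'inn'])  # -> ['123']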
+ def strip_dict_fields(record: Dict[str, Any], fields: List[List[str]], startkey: int = 0) -> Dict[str, Any]:
+     """Strip dictionary fields based on field list.
+
+     Args:
+         record: Dictionary to process.
+         fields: List of field paths (nested keys as lists).
+         startkey: Starting index for field path (default: 0).
+
+     Returns:
+         Modified dictionary with only specified fields retained.
+     """
+     # Set gives O(1) membership checks
+     localf = set()
+     for field in fields:
+         if len(field) > startkey:
+             localf.add(field[startkey])
+     # Iterate over a copy of the keys to avoid modifying during iteration
+     keys = list(record.keys())
+     for k in keys:
+         if k not in localf:
+             del record[k]
+
+     for k in record:
+         if isinstance(record[k], dict):
+             record[k] = strip_dict_fields(record[k], fields, startkey + 1)
+     return record
+
+
+ def dict_generator(indict: Union[Dict[str, Any], Any], pre: Optional[List[str]] = None):
+     """Process dictionary and yield flattened key-value pairs.
+
+     Recursively traverses nested dictionaries and lists, yielding
+     key paths with their values. Skips '_id' keys.
+
+     Args:
+         indict: Input dictionary to process.
+         pre: Prefix keys list for nested structures (default: None).
+
+     Yields:
+         Lists containing key path and value: [key1, key2, ..., value]
+     """
+     pre = pre[:] if pre else []
+     if isinstance(indict, dict):
+         for key, value in indict.items():
+             if key == "_id":
+                 continue
+             if isinstance(value, dict):
+                 yield from dict_generator(value, pre + [key])
+             elif isinstance(value, (list, tuple)):
+                 # Only dict items inside lists are traversed; scalar list
+                 # items are skipped by design.
+                 for v in value:
+                     if isinstance(v, dict):
+                         yield from dict_generator(v, pre + [key])
+             else:
+                 yield pre + [key, value]
+     else:
+         yield indict
+
+
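# Illustrative example (editorial annotation, not part of the package diff):
#
#     list(dict_generator({'a': 1, 'b': {'c': 2}}))
#     # -> [['a', 1], ['b', 'c', 2]]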
+ def guess_int_size(i: int) -> str:
+     """Guess appropriate integer size type based on value.
+
+     Args:
+         i: Non-negative integer value to analyze.
+
+     Returns:
+         String indicating size type: 'uint8', 'uint16', or 'uint32'.
+     """
+     # Inclusive bounds: uint8 holds up to 255, uint16 up to 65535
+     if i <= 255:
+         return 'uint8'
+     if i <= 65535:
+         return 'uint16'
+     return 'uint32'
+
+
+ def guess_datatype(s: Union[str, int, float, None], qd: Any) -> Dict[str, Any]:
+     """Guess data type of a string value.
+
+     Analyzes a string to determine if it represents an integer, float,
+     date, empty value, or remains a string.
+
+     Args:
+         s: Value to analyze (can be string, int, float, or None).
+         qd: Query date matcher object for date detection.
+
+     Returns:
+         Dictionary with 'base' key indicating detected type and optional
+         'subtype' or 'pat' keys for additional information.
+     """
+     attrs = {'base': 'str'}
+     if s is None:
+         return {'base': 'empty'}
+     if isinstance(s, int):
+         return {'base': 'int'}
+     if isinstance(s, float):
+         return {'base': 'float'}
+     if not isinstance(s, str):
+         return {'base': 'typed'}
+     if s.isdigit():
+         # A leading zero means a numeric-looking string, not a number
+         if s[0] == '0':
+             attrs = {'base': 'numstr'}
+         else:
+             attrs = {'base': 'int', 'subtype': guess_int_size(int(s))}
+     else:
+         try:
+             float(s)
+             return {'base': 'float'}
+         except ValueError:
+             pass
+     if qd:
+         is_date = False
+         res = qd.match(s)
+         if res:
+             attrs = {'base': 'date', 'pat': res['pattern']}
+             is_date = True
+         if not is_date:
+             if len(s.strip()) == 0:
+                 attrs = {'base': 'empty'}
+     return attrs
+
+
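# Illustrative examples (editorial annotation, not part of the package diff);
# passing qd=None disables date matching:
#
#     guess_datatype('42', None)    # -> {'base': 'int', 'subtype': 'uint8'}
#     guess_datatype('007', None)   # -> {'base': 'numstr'}
#     guess_datatype('3.14', None)  # -> {'base': 'float'}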
+ def buf_count_newlines_gen(fname: str) -> int:
+     """Count newlines in a file using buffered reading.
+
+     Efficiently counts newline characters in large files by reading
+     in chunks rather than loading the entire file into memory.
+
+     Args:
+         fname: Path to the file to analyze.
+
+     Returns:
+         Integer count of newline characters in the file.
+     """
+     def _make_gen(reader):
+         # Yield 64 KiB chunks until EOF
+         while True:
+             b = reader(2 ** 16)
+             if not b:
+                 break
+             yield b
+
+     with open(fname, "rb") as f:
+         count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
+     return count
+
+
+ def get_dict_keys(iterable: Any, limit: int = 1000) -> List[str]:
+     """Get all unique dictionary keys from an iterable of dictionaries.
+
+     Extracts all nested keys from dictionaries, flattening them with dot notation.
+
+     Args:
+         iterable: Iterable of dictionaries to process.
+         limit: Maximum number of items to process (default: 1000).
+
+     Returns:
+         List of unique flattened key paths (e.g., ['field1', 'field2.subfield']).
+     """
+     n = 0
+     keys_set = set()  # Set gives O(1) membership checks
+     for item in iterable:
+         if limit and n > limit:
+             break
+         n += 1
+         for path in dict_generator(item):
+             keys_set.add(".".join(path[:-1]))
+     return list(keys_set)  # Callers expect a list
+
+
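# Illustrative example (editorial annotation, not part of the package diff):
#
#     records = [{'name': 'A', 'org': {'inn': '123'}}]
#     sorted(get_dict_keys(records))  # -> ['name', 'org.inn']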
+ def _is_flat(item: Dict[str, Any]) -> bool:
+     """Check if dictionary contains only flat values.
+
+     Args:
+         item: Dictionary to check.
+
+     Returns:
+         True if the dictionary contains no lists or tuples and any nested
+         dictionaries are themselves flat, False otherwise.
+     """
+     for v in item.values():
+         if isinstance(v, (tuple, list)):
+             return False
+         if isinstance(v, dict):
+             if not _is_flat(v):
+                 return False
+     return True
@@ -0,0 +1,11 @@
+ # -*- coding: utf8 -*-
+ """Validation rules module for undatum."""
+ from .commonrules import _validate_email, _validate_url
+ from .ruscodes import _check_inn, _check_ogrn
+
+ VALIDATION_RULEMAP = {
+     'ru.org.ogrn': _check_ogrn,
+     'ru.org.inn': _check_inn,
+     'common.email': _validate_email,
+     'common.url': _validate_url
+ }
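
A minimal dispatch sketch using this rule map (illustrative only; the package's own calling code is not shown in this diff):

    rule = VALIDATION_RULEMAP.get('common.email')
    if rule is not None:
        print(rule('user@example.org'))  # -> True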
@@ -0,0 +1,15 @@
+ # -*- coding: utf8 -*-
+ """Common validation rules for email and URL."""
+ from email.utils import parseaddr
+
+ import validators
+
+
+ def _validate_email(s):
+     """Loosely validate an email address by checking the parsed addr-spec."""
+     return '@' in parseaddr(s)[1]
+
+
+ def _validate_url(s):
+     """Validate a URL using the validators library."""
+     # validators.url returns True on success and a failure object otherwise;
+     # comparing with `is True` normalizes the result to a plain bool.
+     r = validators.url(s)
+     return r is True
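
Illustrative checks (editorial annotation, not part of the package diff):

    _validate_email('user@example.org')   # -> True
    _validate_email('not-an-email')       # -> False
    _validate_url('https://example.org')  # -> True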