undatum-1.0.17-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- undatum/__init__.py +9 -0
- undatum/__main__.py +25 -0
- undatum/ai/__init__.py +145 -0
- undatum/ai/base.py +85 -0
- undatum/ai/config.py +184 -0
- undatum/ai/perplexity.py +79 -0
- undatum/ai/providers.py +1002 -0
- undatum/ai/schemas.py +42 -0
- undatum/cmds/__init__.py +6 -0
- undatum/cmds/analyzer.py +697 -0
- undatum/cmds/converter.py +646 -0
- undatum/cmds/ingester.py +116 -0
- undatum/cmds/query.py +68 -0
- undatum/cmds/schemer.py +328 -0
- undatum/cmds/selector.py +437 -0
- undatum/cmds/statistics.py +158 -0
- undatum/cmds/textproc.py +59 -0
- undatum/cmds/transformer.py +81 -0
- undatum/cmds/validator.py +137 -0
- undatum/common/__init__.py +6 -0
- undatum/common/functions.py +81 -0
- undatum/common/iterable.py +222 -0
- undatum/common/scheme.py +261 -0
- undatum/constants.py +21 -0
- undatum/core.py +616 -0
- undatum/formats/__init__.py +6 -0
- undatum/formats/docx.py +160 -0
- undatum/utils.py +298 -0
- undatum/validate/__init__.py +11 -0
- undatum/validate/commonrules.py +15 -0
- undatum/validate/ruscodes.py +202 -0
- undatum-1.0.17.dist-info/METADATA +610 -0
- undatum-1.0.17.dist-info/RECORD +37 -0
- undatum-1.0.17.dist-info/WHEEL +6 -0
- undatum-1.0.17.dist-info/entry_points.txt +3 -0
- undatum-1.0.17.dist-info/licenses/LICENSE +21 -0
- undatum-1.0.17.dist-info/top_level.txt +1 -0
@@ -0,0 +1,610 @@
Metadata-Version: 2.4
Name: undatum
Version: 1.0.17
Summary: A powerful command-line tool for data processing and analysis
Home-page: https://github.com/datacoon/undatum/
Download-URL: https://github.com/datacoon/undatum/
Author: Ivan Begtin
Author-email: Ivan Begtin <ivan@begtin.tech>
License: MIT
Keywords: json,jsonl,csv,bson,cli,dataset,data-processing
Classifier: Development Status :: 5 - Production/Stable
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.8
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: System Administrators
Classifier: License :: OSI Approved :: BSD License
Classifier: Topic :: Software Development
Classifier: Topic :: System :: Networking
Classifier: Topic :: Terminals
Classifier: Topic :: Text Processing
Classifier: Topic :: Utilities
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Dynamic: author
Dynamic: download-url
Dynamic: home-page
Dynamic: license-file
Dynamic: requires-python

# undatum

> A powerful command-line tool for data processing and analysis

**undatum** (pronounced *un-da-tum*) is a modern CLI tool designed to make working with large datasets as simple and efficient as possible. It provides a unified interface for converting, analyzing, validating, and transforming data across multiple formats.

## Features

- **Multi-format support**: CSV, JSON Lines, BSON, XML, XLS, XLSX, Parquet, AVRO, ORC
- **Compression support**: ZIP, XZ, GZ, BZ2, ZSTD
- **Low memory footprint**: Streams data for efficient processing of large files
- **Automatic detection**: Encoding, delimiters, and file types
- **Data validation**: Built-in rules for emails, URLs, and custom validators
- **Advanced statistics**: Field analysis, frequency calculations, and date detection
- **Flexible filtering**: Query and filter data using expressions
- **Schema generation**: Automatic schema detection and generation
- **AI-powered documentation**: Automatic field and dataset descriptions using multiple LLM providers (OpenAI, OpenRouter, Ollama, LM Studio, Perplexity) with structured JSON output

## Installation

### Using pip (Recommended)

```bash
pip install --upgrade pip setuptools
pip install undatum
```

### Requirements

- Python 3.8 or greater

## Quick Start

```bash
# Get file headers
undatum headers data.jsonl

# Analyze file structure
undatum analyze data.jsonl

# Get statistics
undatum stats data.csv

# Convert XML to JSON Lines
undatum convert --tagname item data.xml data.jsonl

# Get unique values
undatum uniq --fields category data.jsonl

# Calculate frequency
undatum frequency --fields status data.csv
```

## Commands

### `analyze`

Analyzes data files and provides human-readable insights about structure, encoding, fields, and data types. With `--autodoc`, automatically generates field descriptions and dataset summaries using AI.

```bash
# Basic analysis
undatum analyze data.jsonl

# With AI-powered documentation
undatum analyze data.jsonl --autodoc

# Using a specific AI provider
undatum analyze data.jsonl --autodoc --ai-provider openai --ai-model gpt-4o-mini

# Output to file
undatum analyze data.jsonl --output report.yaml --autodoc
```

**Output includes:**
- File type, encoding, compression
- Number of records and fields
- Field types and structure
- Table detection for nested data (JSON/XML)
- AI-generated field descriptions (with `--autodoc`)
- AI-generated dataset summary (with `--autodoc`)

**AI Provider Options:**
- `--ai-provider`: Choose provider (openai, openrouter, ollama, lmstudio, perplexity)
- `--ai-model`: Specify model name (provider-specific)
- `--ai-base-url`: Custom API endpoint URL

**Supported AI Providers:**

1. **OpenAI** (default if `OPENAI_API_KEY` is set)
   ```bash
   export OPENAI_API_KEY=sk-...
   undatum analyze data.csv --autodoc --ai-provider openai --ai-model gpt-4o-mini
   ```

2. **OpenRouter** (supports multiple models via a unified API)
   ```bash
   export OPENROUTER_API_KEY=sk-or-...
   undatum analyze data.csv --autodoc --ai-provider openrouter --ai-model openai/gpt-4o-mini
   ```

3. **Ollama** (local models, no API key required)
   ```bash
   # Start Ollama and pull a model first: ollama pull llama3.2
   undatum analyze data.csv --autodoc --ai-provider ollama --ai-model llama3.2
   # Or set a custom URL: export OLLAMA_BASE_URL=http://localhost:11434
   ```

4. **LM Studio** (local models, OpenAI-compatible API)
   ```bash
   # Start LM Studio and load a model
   undatum analyze data.csv --autodoc --ai-provider lmstudio --ai-model local-model
   # Or set a custom URL: export LMSTUDIO_BASE_URL=http://localhost:1234/v1
   ```

5. **Perplexity** (backward compatible, uses `PERPLEXITY_API_KEY`)
   ```bash
   export PERPLEXITY_API_KEY=pplx-...
   undatum analyze data.csv --autodoc --ai-provider perplexity
   ```

**Configuration Methods:**

The AI provider can be configured via:

1. **Environment variables** (lowest precedence):
   ```bash
   export UNDATUM_AI_PROVIDER=openai
   export OPENAI_API_KEY=sk-...
   ```

2. **Config file** (medium precedence):
   Create `undatum.yaml` in your project root or `~/.undatum/config.yaml`:
   ```yaml
   ai:
     provider: openai
     api_key: ${OPENAI_API_KEY}  # Can reference env vars
     model: gpt-4o-mini
     timeout: 30
   ```

3. **CLI arguments** (highest precedence):
   ```bash
   undatum analyze data.csv --autodoc --ai-provider openai --ai-model gpt-4o-mini
   ```

### `convert`

Converts data between different formats. Supports CSV, JSON Lines, BSON, XML, XLS, XLSX, Parquet, AVRO, and ORC.

```bash
# XML to JSON Lines
undatum convert --tagname item data.xml data.jsonl

# CSV to Parquet
undatum convert data.csv data.parquet

# JSON Lines to CSV
undatum convert data.jsonl data.csv
```

**Supported conversions:**

| From / To | CSV | JSONL | BSON | JSON | XLS | XLSX | XML | Parquet | ORC | AVRO |
|-----------|-----|-------|------|------|-----|------|-----|---------|-----|------|
| CSV       | -   | ✓     | ✓    | -    | -   | -    | -   | ✓       | ✓   | ✓    |
| JSONL     | ✓   | -     | -    | -    | -   | -    | -   | ✓       | ✓   | -    |
| BSON      | -   | ✓     | -    | -    | -   | -    | -   | -       | -   | -    |
| JSON      | -   | ✓     | -    | -    | -   | -    | -   | -       | -   | -    |
| XLS       | -   | ✓     | ✓    | -    | -   | -    | -   | -       | -   | -    |
| XLSX      | -   | ✓     | ✓    | -    | -   | -    | -   | -       | -   | -    |
| XML       | -   | ✓     | -    | -    | -   | -    | -   | -       | -   | -    |

### `headers`

Extracts field names from data files. Works with CSV, JSON Lines, BSON, and XML files.

```bash
undatum headers data.jsonl
undatum headers data.csv --limit 50000
```

### `stats`

Generates detailed statistics about your dataset, including field types, uniqueness, lengths, and more.

```bash
undatum stats data.jsonl
undatum stats data.csv --checkdates
```

**Statistics include:**
- Field types and array flags
- Unique value counts and percentages
- Min/max/average lengths
- Date field detection

### `frequency`

Calculates frequency distribution for specified fields.

```bash
undatum frequency --fields category data.jsonl
undatum frequency --fields status,region data.csv
```

### `uniq`

Extracts all unique values from the specified field(s).

```bash
# Single field
undatum uniq --fields category data.jsonl

# Multiple fields (unique combinations)
undatum uniq --fields status,region data.jsonl
```

### `select`

Selects and reorders columns from files. Supports filtering.

```bash
undatum select --fields name,email,status data.jsonl
undatum select --fields name,email --filter "`status` == 'active'" data.jsonl
```

### `split`

Splits datasets into multiple files based on chunk size or field values.

```bash
# Split by chunk size
undatum split --chunksize 10000 data.jsonl

# Split by field value
undatum split --fields category data.jsonl
```

### `validate`

Validates data against built-in or custom validation rules.

```bash
# Validate email addresses
undatum validate --rule common.email --fields email data.jsonl

# Validate Russian INN
undatum validate --rule ru.org.inn --fields VendorINN data.jsonl --mode stats

# Output invalid records
undatum validate --rule ru.org.inn --fields VendorINN data.jsonl --mode invalid
```

**Available validation rules:**
- `common.email` - Email address validation
- `common.url` - URL validation
- `ru.org.inn` - Russian organization INN identifier
- `ru.org.ogrn` - Russian organization OGRN identifier

### `schema`

Generates data schemas from files. Supports Cerberus and other schema formats.

```bash
undatum schema data.jsonl
undatum schema data.jsonl --output schema.yaml
```
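
For orientation, Cerberus-style schemas map field names to type rules. A hypothetical result for the JSONL example shown in the Data Formats section might look like the sketch below; the exact keys undatum emits may differ.

```yaml
# Hypothetical Cerberus-style schema for records like {"name": "Alice", "age": 30};
# illustrative only, not undatum's guaranteed output shape.
name:
  type: string
  required: true
age:
  type: integer
  required: true
```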

### `query`

Queries data using the MistQL query language (experimental).

```bash
undatum query data.jsonl "SELECT * WHERE status = 'active'"
```

### `flatten`

Flattens nested data structures into key-value pairs.

```bash
undatum flatten data.jsonl
```
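
As a rough illustration of the idea (the actual key format undatum emits may differ), consider a nested input record:

```jsonl
{"user": {"name": "Alice", "address": {"city": "Berlin"}}}
```

Flattening such a record would yield pairs along the lines of `user.name = Alice` and `user.address.city = Berlin`, assuming dot-delimited paths.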

### `apply`

Applies a transformation script to each record in the file.

```bash
undatum apply --script transform.py data.jsonl output.jsonl
```
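
The script contract isn't documented in this README; as a hypothetical sketch, assuming the script operates on one record dict at a time, a `transform.py` might look like:

```python
# transform.py - hypothetical per-record transformation sketch;
# undatum's actual script interface may differ.

def transform(record: dict) -> dict:
    """Normalize one record: lowercase emails, strip whitespace from names."""
    if record.get("email"):
        record["email"] = record["email"].lower()
    if record.get("name"):
        record["name"] = record["name"].strip()
    return record
```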

## Advanced Usage

### Working with Compressed Files

undatum can process files inside compressed containers (ZIP, GZ, BZ2, XZ, ZSTD) with minimal memory usage.

```bash
# Process file inside ZIP archive
undatum headers --format-in jsonl data.zip

# Process XZ compressed file
undatum uniq --fields country --format-in jsonl data.jsonl.xz
```

### Filtering Data

Most commands support filtering using expressions:

```bash
# Filter by field value
undatum select --fields name,email --filter "`status` == 'active'" data.jsonl

# Complex filters
undatum frequency --fields category --filter "`price` > 100" data.jsonl
```

**Filter syntax:**
- Field names: `` `fieldname` ``
- String values: `'value'`
- Operators: `==`, `!=`, `>`, `<`, `>=`, `<=`, `and`, `or`
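
Conditions can be combined; for example, an illustrative command composed purely from the syntax listed above (exact parser behavior may vary):

```bash
# Combine two conditions with `and` (illustrative)
undatum select --fields name,email --filter "`status` == 'active' and `price` > 100" data.jsonl
```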

### Date Detection

Automatic date/datetime field detection:

```bash
undatum stats --checkdates data.jsonl
```

This uses the `qddate` library to automatically identify and parse date fields.
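
For a sense of what that detection does, here is a minimal standalone sketch using qddate directly (this assumes qddate's `DateParser` API; undatum's internal integration may differ):

```python
# Standalone qddate sketch; assumes qddate's DateParser API.
from qddate import DateParser

parser = DateParser()
for value in ["2021-03-15", "15.03.2021", "not a date"]:
    match = parser.match(value)  # parse result on success, falsy otherwise
    print(value, "->", "looks like a date" if match else "no date pattern")
```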

### Custom Encoding and Delimiters

Override automatic detection:

```bash
undatum headers --encoding cp1251 --delimiter ";" data.csv
undatum convert --encoding utf-8 --delimiter "," data.csv data.jsonl
```

## Data Formats

### JSON Lines (JSONL)

JSON Lines is a text format where each line is a valid JSON object. It combines the flexibility of JSON with line-by-line processing, making it ideal for large datasets.

```jsonl
{"name": "Alice", "age": 30}
{"name": "Bob", "age": 25}
{"name": "Charlie", "age": 35}
```

### CSV

Standard comma-separated values format. undatum automatically detects delimiters (comma, semicolon, tab) and encoding.

### BSON

Binary JSON format used by MongoDB. Efficient for binary data storage.

### XML

XML files can be converted to JSON Lines by specifying the tag name containing records.
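
For illustration, given `--tagname item`, each `<item>` element becomes one record (hypothetical input):

```xml
<!-- data.xml: each <item> becomes one JSON Lines record -->
<items>
  <item><name>Alice</name><age>30</age></item>
  <item><name>Bob</name><age>25</age></item>
</items>
```

Running `undatum convert --tagname item data.xml data.jsonl` on such a file would emit one JSON object per `<item>`, e.g. `{"name": "Alice", "age": "30"}` (whether numeric values stay strings depends on the converter).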

## AI Provider Troubleshooting

### Common Issues

**Provider not found:**
```bash
# Error: No AI provider specified
# Solution: Set environment variable or use --ai-provider
export UNDATUM_AI_PROVIDER=openai
# or
undatum analyze data.csv --autodoc --ai-provider openai
```

**API key not found:**
```bash
# Error: API key is required
# Solution: Set provider-specific API key
export OPENAI_API_KEY=sk-...
export OPENROUTER_API_KEY=sk-or-...
export PERPLEXITY_API_KEY=pplx-...
```

**Ollama connection failed:**
```bash
# Error: Connection refused
# Solution: Ensure Ollama is running and model is pulled
ollama serve
ollama pull llama3.2
# Or specify custom URL
export OLLAMA_BASE_URL=http://localhost:11434
```

**LM Studio connection failed:**
```bash
# Error: Connection refused
# Solution: Start LM Studio server and load a model
# In LM Studio: Start Server, then:
export LMSTUDIO_BASE_URL=http://localhost:1234/v1
```

**Structured output errors:**
- All providers now use JSON Schema for reliable parsing
- If a provider doesn't support structured output, it will fall back gracefully
- Check provider documentation for model compatibility
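
As a rough illustration of the approach (not the package's actual schema, which lives in `undatum/ai/schemas.py`), a structured-output JSON Schema for field descriptions might look like:

```json
{
  "type": "object",
  "properties": {
    "fields": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "description": {"type": "string"}
        },
        "required": ["name", "description"]
      }
    }
  },
  "required": ["fields"]
}
```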

### Provider-Specific Notes

- **OpenAI**: Requires an API key; supports `gpt-4o-mini`, `gpt-4o`, `gpt-3.5-turbo`, etc.
- **OpenRouter**: Unified API for multiple providers; supports models from OpenAI, Anthropic, Google, etc.
- **Ollama**: Local models, no API key needed, but requires Ollama to be installed and running
- **LM Studio**: Local models, OpenAI-compatible API; requires LM Studio to be running
- **Perplexity**: Requires an API key; uses the `sonar` model by default

## Performance Tips

1. **Use appropriate formats**: Parquet/ORC for analytics, JSONL for streaming
2. **Compression**: Use ZSTD or GZIP for better compression ratios (see the sketch after this list)
3. **Chunking**: Split large files for parallel processing
4. **Filtering**: Apply filters early to reduce data volume
5. **Streaming**: undatum streams data by default for low memory usage
6. **AI Documentation**: Use local providers (Ollama/LM Studio) for faster, free documentation generation
7. **Batch Processing**: AI descriptions are generated per table; consider splitting very large datasets
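
For tip 2, the sketch below compresses a JSONL file with zstd and then processes the archive directly; the `.zst` handling mirrors the XZ example earlier and should be treated as illustrative:

```bash
# Compress, then stream directly from the archive (illustrative)
zstd data.jsonl   # produces data.jsonl.zst
undatum uniq --fields country --format-in jsonl data.jsonl.zst
```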

## AI-Powered Documentation

The `analyze` command can automatically generate field descriptions and dataset summaries using AI when `--autodoc` is enabled. This feature supports multiple LLM providers and uses structured JSON output for reliable parsing.

### Quick Examples

```bash
# Basic AI documentation (auto-detects provider from environment)
undatum analyze data.csv --autodoc

# Use OpenAI with a specific model
undatum analyze data.csv --autodoc --ai-provider openai --ai-model gpt-4o-mini

# Use a local Ollama model
undatum analyze data.csv --autodoc --ai-provider ollama --ai-model llama3.2

# Use OpenRouter to access various models
undatum analyze data.csv --autodoc --ai-provider openrouter --ai-model anthropic/claude-3-haiku

# Output to YAML with AI descriptions
undatum analyze data.csv --autodoc --output schema.yaml --outtype yaml
```

### Configuration File Example

Create `undatum.yaml` in your project:

```yaml
ai:
  provider: openai
  model: gpt-4o-mini
  timeout: 30
```

Or use `~/.undatum/config.yaml` for global settings:

```yaml
ai:
  provider: ollama
  model: llama3.2
  ollama_base_url: http://localhost:11434
```

### Language Support

Generate descriptions in different languages:

```bash
# English (default)
undatum analyze data.csv --autodoc --lang English

# Russian
undatum analyze data.csv --autodoc --lang Russian

# Spanish
undatum analyze data.csv --autodoc --lang Spanish
```

### What Gets Generated

With `--autodoc` enabled, the analyzer will:

1. **Field Descriptions**: Generate clear, concise descriptions for each field explaining what it represents
2. **Dataset Summary**: Provide an overall description of the dataset based on sample data

Example output:

```yaml
tables:
  - id: data.csv
    fields:
      - name: customer_id
        ftype: VARCHAR
        description: "Unique identifier for each customer"
      - name: purchase_date
        ftype: DATE
        description: "Date when the purchase was made"
    description: "Customer purchase records containing transaction details"
```

## Examples

### Data Pipeline Example

```bash
# 1. Analyze source data
undatum analyze source.xml

# 2. Convert to JSON Lines
undatum convert --tagname item source.xml data.jsonl

# 3. Validate data
undatum validate --rule common.email --fields email data.jsonl --mode invalid > invalid.jsonl

# 4. Get statistics
undatum stats data.jsonl > stats.json

# 5. Extract unique categories
undatum uniq --fields category data.jsonl > categories.txt

# 6. Convert to Parquet for analytics
undatum convert data.jsonl data.parquet
```

### Data Quality Check

```bash
# Check for duplicate emails
undatum frequency --fields email data.jsonl | grep -v "1$"

# Validate all required fields
undatum validate --rule common.email --fields email data.jsonl
undatum validate --rule common.url --fields website data.jsonl

# Generate schema with AI documentation
undatum schema data.jsonl --output schema.yaml --autodoc
```

### AI Documentation Workflow

```bash
# 1. Analyze dataset with AI-generated descriptions
undatum analyze sales_data.csv --autodoc --ai-provider openai --output analysis.yaml

# 2. Review generated field descriptions
cat analysis.yaml

# 3. Use descriptions in schema generation
undatum schema sales_data.csv --autodoc --output documented_schema.yaml

# 4. Bulk schema extraction with AI documentation
undatum schema_bulk ./data_dir --autodoc --output ./schemas --mode distinct
```

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## License

MIT License - see LICENSE file for details.

## Links

- [GitHub Repository](https://github.com/datacoon/undatum)
- [Issue Tracker](https://github.com/datacoon/undatum/issues)

## Support

For questions, issues, or feature requests, please open an issue on GitHub.
@@ -0,0 +1,37 @@
undatum/__init__.py,sha256=co5RiOtrWnVG-yw53ncMEzc4LSXBRiPF2AFShkLyKHg,198
undatum/__main__.py,sha256=uwmbA0FWcqwHdZ_SUwegPbkyTszTli1nK6ys-KQxp5U,514
undatum/constants.py,sha256=-6ZbXxF14clzMFUdL4qcQG2HU9u9LlCzqPMSCbgW0vU,799
undatum/core.py,sha256=QVDH2yIdY0hjc3U_kyJEybb5gqaUL4SNPeldZIDlpuY,29099
undatum/utils.py,sha256=ftdrNKj1_P7OE76UXUtw1rYguxlopMZS1YjPubQdxq8,8958
undatum/ai/__init__.py,sha256=9fKMA7o480pKWxlAr45X6alUlsupTdD402wa1DH-Obk,4356
undatum/ai/base.py,sha256=TPx1uxlUdpfaks7r3Za2WvNfYpuuugbbnfnz6Ut8QsI,2726
undatum/ai/config.py,sha256=ge2SssHD2qFIIudfiZApqkJcMe2SQO2CUpDDC6nPbHM,5484
undatum/ai/perplexity.py,sha256=aUoVmkG5KtpXa50KaX23axG94ydL3xaLOdYHjSXRGO4,2550
undatum/ai/providers.py,sha256=526rKIkd96smMAuWuPTt84bz1sZm8nIVCcFjdOLxNtg,44041
undatum/ai/schemas.py,sha256=rfuPwTNDJodCp5axXNvnHwTL5dZqePTpjnnEE5WM_UY,1288
undatum/cmds/__init__.py,sha256=1_o2mrs6pk6HoerxD3vWrZjTGb09I3wMW7D2TfdE1BE,225
undatum/cmds/analyzer.py,sha256=XQqnE5YpN2aYYcK4Qr_GEGMDK1AK3rhy577hi-FyttM,31397
undatum/cmds/converter.py,sha256=zCCvb1xKI2AG0gWTZN6Jnipvy0vEU0nAfVk-9YqlDtE,23887
undatum/cmds/ingester.py,sha256=ELeXFCmNufPfOPajHKRie3bQN7-NCdTz06-D95zyu_s,4139
undatum/cmds/query.py,sha256=8B2gEPEMAY-ZXgmOp-VPJ4-CaD10jZJxDYst-JLpIJA,2239
undatum/cmds/schemer.py,sha256=3u1hNHmcPPgCLlYtyxResW1NWzAnnmMzii05PpTWA-A,13785
undatum/cmds/selector.py,sha256=9Ow5I2M87oanYj528bYouM2JGiI2R86ZzxNLYzMMV3k,17697
undatum/cmds/statistics.py,sha256=qd2RAkRkU86O5H8KhANSehUbrf_dd9kz0Am04cMLQto,6727
undatum/cmds/textproc.py,sha256=zqpYeyIEhmOyEsDhteHagI-vweLt_pVZ48IFJKcQkmc,1842
undatum/cmds/transformer.py,sha256=iATunWi-6RZXKUe1QiZYeLVsOfSC4KPWG7Z1Im0a2rU,2461
undatum/cmds/validator.py,sha256=VJxqXbL0L3EOpcOqTIlNpHauHh8bAjlqCQCz6P-pkmo,5403
undatum/common/__init__.py,sha256=tLMXIqefowqBbHU6dHY_nE_VQW7duuetMYjUHVsOfys,240
undatum/common/functions.py,sha256=6JsGeNmGPJnMUq5C3xxniYMDYTkZfHr9KtsjXgrMvkM,2854
undatum/common/iterable.py,sha256=dLDyWtqEEnU5KnPBtAT9bY6z0bO2mwDtT5kU45jaRC4,8910
undatum/common/scheme.py,sha256=Yes_zRGmxvHmaG7iCPNb3XDThFBklD-c6qOx1UZhGkY,10461
undatum/formats/__init__.py,sha256=_RemWRfqoLFZ43uYxBwy5rgNp8NHcMjOMvHh2mNyjH0,187
undatum/formats/docx.py,sha256=quKcUqXAVxcsochfmYFRnFCBxbyRI4-PBiWlWHGgdrQ,5141
undatum/validate/__init__.py,sha256=PnmijHZIOitBIhdF-wka7zy0Qw0JHHkudpiRqVrLggM,325
undatum/validate/commonrules.py,sha256=oHSfXI_m_XoUwXBBjJS4eilUvpu_o-Ph9hMNgQbM8vw,317
undatum/validate/ruscodes.py,sha256=bzXyAlB1MnDO5rag2Q5BzsUCGVRPGwW0zfRwb4uNI0w,12927
undatum-1.0.17.dist-info/licenses/LICENSE,sha256=RiNCUql11GYeXe1ltOBHnbow_JUlg60WrlMQZj-fDCk,1105
undatum-1.0.17.dist-info/METADATA,sha256=mGTf42hqBsWNGOKJDS6kafHqMDjYY1_sq8Co9Mz0jKo,17032
undatum-1.0.17.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
undatum-1.0.17.dist-info/entry_points.txt,sha256=pc7sVxjlwi9UEHpatS0qWVuTJejy_E9otb73dmybvBs,79
undatum-1.0.17.dist-info/top_level.txt,sha256=fgoMX0hwN9HGxtJ7DOH9czBQy-1B3Yqq2M7lqX8AUgI,8
undatum-1.0.17.dist-info/RECORD,,
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 NGO "Informational Culture"

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1 @@
undatum