undatum 1.0.14__tar.gz → 1.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of undatum may be problematic.
- {undatum-1.0.14/undatum.egg-info → undatum-1.0.15}/PKG-INFO +55 -7
- {undatum-1.0.14 → undatum-1.0.15}/README.rst +19 -1
- {undatum-1.0.14 → undatum-1.0.15}/setup.py +7 -1
- undatum-1.0.15/tests/test.py +12 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/__init__.py +2 -1
- {undatum-1.0.14 → undatum-1.0.15}/undatum/__main__.py +4 -3
- undatum-1.0.15/undatum/ai/perplexity.py +78 -0
- undatum-1.0.15/undatum/cmds/analyzer.py +472 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/converter.py +194 -21
- undatum-1.0.15/undatum/cmds/ingester.py +110 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/query.py +3 -7
- undatum-1.0.15/undatum/cmds/schemer.py +274 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/selector.py +63 -36
- {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/statistics.py +31 -17
- {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/textproc.py +20 -8
- undatum-1.0.15/undatum/cmds/transformer.py +78 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/validator.py +3 -8
- undatum-1.0.15/undatum/common/__init__.py +1 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/common/functions.py +2 -1
- {undatum-1.0.14 → undatum-1.0.15}/undatum/common/iterable.py +3 -3
- {undatum-1.0.14 → undatum-1.0.15}/undatum/common/scheme.py +1 -1
- {undatum-1.0.14 → undatum-1.0.15}/undatum/constants.py +3 -2
- undatum-1.0.15/undatum/core.py +350 -0
- undatum-1.0.15/undatum/formats/__init__.py +0 -0
- undatum-1.0.15/undatum/formats/docx.py +159 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/utils.py +2 -34
- {undatum-1.0.14 → undatum-1.0.15}/undatum/validate/__init__.py +1 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum/validate/commonrules.py +2 -1
- {undatum-1.0.14 → undatum-1.0.15}/undatum/validate/ruscodes.py +1 -0
- {undatum-1.0.14 → undatum-1.0.15/undatum.egg-info}/PKG-INFO +55 -7
- {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/SOURCES.txt +6 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/requires.txt +5 -0
- undatum-1.0.14/undatum/cmds/analyzer.py +0 -284
- undatum-1.0.14/undatum/cmds/schemer.py +0 -46
- undatum-1.0.14/undatum/cmds/transformer.py +0 -94
- undatum-1.0.14/undatum/core.py +0 -444
- {undatum-1.0.14 → undatum-1.0.15}/AUTHORS.rst +0 -0
- {undatum-1.0.14 → undatum-1.0.15}/LICENSE +0 -0
- {undatum-1.0.14 → undatum-1.0.15}/setup.cfg +0 -0
- {undatum-1.0.14/undatum/cmds → undatum-1.0.15/undatum/ai}/__init__.py +0 -0
- {undatum-1.0.14/undatum/common → undatum-1.0.15/undatum/cmds}/__init__.py +0 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/dependency_links.txt +0 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/entry_points.txt +0 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/not-zip-safe +0 -0
- {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: undatum
-Version: 1.0.14
+Version: 1.0.15
 Summary: undatum: a command-line tool for data processing. Brings CSV simplicity to JSON lines and BSON
 Home-page: https://github.com/datacoon/undatum/
 Download-URL: https://github.com/datacoon/undatum/
@@ -8,7 +8,6 @@ Author: Ivan Begtin
 Author-email: ivan@begtin.tech
 License: MIT
 Keywords: json jsonl csv bson cli dataset
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3 :: Only
@@ -24,9 +23,42 @@ Classifier: Topic :: Text Processing
 Classifier: Topic :: Utilities
 Requires-Python: >=3.8
 Description-Content-Type: text/x-rst
-Provides-Extra: python_version == "3.8" or python_version == "3.8"
 License-File: LICENSE
 License-File: AUTHORS.rst
+Requires-Dist: chardet>=3.0.4
+Requires-Dist: click>=8.0.3
+Requires-Dist: dictquery>=0.4.0
+Requires-Dist: jsonlines>=1.2.0
+Requires-Dist: openpyxl>=3.0.5
+Requires-Dist: orjson>=3.6.6
+Requires-Dist: pandas>=1.1.3
+Requires-Dist: pymongo>=3.11.0
+Requires-Dist: qddate>=0.1.1
+Requires-Dist: tabulate>=0.8.7
+Requires-Dist: validators>=0.18.1
+Requires-Dist: xlrd>=1.2.0
+Requires-Dist: xmltodict
+Requires-Dist: rich
+Requires-Dist: duckdb
+Requires-Dist: pyzstd
+Requires-Dist: pydantic
+Requires-Dist: typer
+Provides-Extra: python-version-3-8-or-python-version-3-8
+Requires-Dist: argparse>=1.2.1; extra == "python-version-3-8-or-python-version-3-8"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: download-url
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 ==================================================
 undatum -- a command-line tool for data processing
@@ -52,7 +84,7 @@ Main features
 * Common data operations against CSV, JSON lines and BSON files
 * Built-in data filtering
 * Support data compressed with ZIP, XZ, GZ, BZ2
-* Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet file types
+* Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet, AVRO and ORC file types
 * Low memory footprint
 * Support for compressed datasets
 * Advanced statistics calculations
@@ -278,6 +310,24 @@ Converts CSV file feddomains.csv to Parquet file feddomains.parquet
 $ undatum convert examples/feddomains.csv examples/feddomains.parquet
 
 
+*Data formats conversion table map*
+
+============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+From / To    CSV    JSONlines    BSON    JSON    XLS    XLSX    XML    Parquet    ORC    AVRO
+============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+CSV          -      Yes          Yes     No      No     No      No     Yes        Yes    Yes
+JSONlines    Yes    -            No      No      No     No      No     Yes        Yes    No
+BSON         No     Yes          -       No      No     No      No     No         No     No
+JSON         No     Yes          No      -       No     No      No     No         No     No
+XLS          No     Yes          Yes     No      -      No      No     No         No     No
+XLSX         No     Yes          Yes     No      No     -       No     No         No     No
+XML          No     Yes          No      No      No     No      -      No         No     No
+Parquet      No     No           No      No      No     No      No     -          No     No
+ORC          No     No           No      No      No     No      No     No         -      No
+AVRO         No     No           No      No      No     No      No     No         No     -
+============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+
+
 Validate command
 ----------------
 
@@ -483,5 +533,3 @@ JSONl
 -----
 
 JSON lines is a replacement to CSV and JSON files, with JSON flexibility and ability to process data line by line, without loading everything into memory.
-
-
README.rst
@@ -22,7 +22,7 @@ Main features
 * Common data operations against CSV, JSON lines and BSON files
 * Built-in data filtering
 * Support data compressed with ZIP, XZ, GZ, BZ2
-* Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet file types
+* Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet, AVRO and ORC file types
 * Low memory footprint
 * Support for compressed datasets
 * Advanced statistics calculations
@@ -248,6 +248,24 @@ Converts CSV file feddomains.csv to Parquet file feddomains.parquet
 $ undatum convert examples/feddomains.csv examples/feddomains.parquet
 
 
+*Data formats conversion table map*
+
+============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+From / To    CSV    JSONlines    BSON    JSON    XLS    XLSX    XML    Parquet    ORC    AVRO
+============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+CSV          -      Yes          Yes     No      No     No      No     Yes        Yes    Yes
+JSONlines    Yes    -            No      No      No     No      No     Yes        Yes    No
+BSON         No     Yes          -       No      No     No      No     No         No     No
+JSON         No     Yes          No      -       No     No      No     No         No     No
+XLS          No     Yes          Yes     No      -      No      No     No         No     No
+XLSX         No     Yes          Yes     No      No     -       No     No         No     No
+XML          No     Yes          No      No      No     No      -      No         No     No
+Parquet      No     No           No      No      No     No      No     -          No     No
+ORC          No     No           No      No      No     No      No     No         -      No
+AVRO         No     No           No      No      No     No      No     No         No     -
+============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+
+
 Validate command
 ----------------
 
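Per the new table, CSV converts directly to JSON lines, BSON, Parquet, ORC and AVRO, while Parquet, ORC and AVRO are write-only targets with no conversions back out. As a hedged illustration of chaining two supported conversions (assuming, as in the README's feddomains example, that the output format is inferred from the target file extension):

    $ undatum convert examples/feddomains.csv examples/feddomains.jsonl
    $ undatum convert examples/feddomains.jsonl examples/feddomains.parquet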
setup.py
@@ -1,3 +1,4 @@
+# -*- coding: utf8 -*-
 # This is purely the result of trial and error.
 
 import sys
@@ -46,7 +47,12 @@ install_requires = [
     'tabulate>=0.8.7',
     'validators>=0.18.1',
     'xlrd>=1.2.0',
-    'xmltodict'
+    'xmltodict',
+    'rich',
+    'duckdb',
+    'pyzstd',
+    'pydantic',
+    'typer'
 ]
 
 
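Since the published metadata above (Requires-Dist) mirrors this install_requires list, upgrading to the new release should pull in rich, duckdb, pyzstd, pydantic and typer automatically; assuming installation from PyPI, something like:

    $ pip install --upgrade undatum==1.0.15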
tests/test.py
@@ -0,0 +1,12 @@
+import pandas as pd
+from undatum.cmds.analyzer import duckdb_decompose
+
+DATA = [
+    {"foo": 1, "bar": "some string", "baz": 1.23},
+    {"foo": 2, "bar": "some other string", "baz": 2.34},
+    {"foo": 3, "bar": "yet another string", "baz": 3.45},
+]
+
+df = pd.DataFrame(DATA)
+
+print(duckdb_decompose(frame=df))
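The new test exercises duckdb_decompose from undatum.cmds.analyzer against a small pandas DataFrame. No test-runner configuration appears in the diff, so presumably it is meant to be run directly once the package and its pandas and (newly added) duckdb dependencies are installed:

    $ python tests/test.py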
undatum/__main__.py
@@ -1,3 +1,4 @@
+# -*- coding: utf8 -*-
 #!/usr/bin/env python
 """The main entry point. Invoke as `undatum' or `python -m undatum`.
 
@@ -7,12 +8,12 @@ import sys
 
 def main():
     try:
-        from .core import
-
+        from .core import app
+        app()
     except KeyboardInterrupt:
         print("Ctrl-C pressed. Aborting")
         sys.exit(0)
 
 
 if __name__ == '__main__':
-
+    app()
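The entry point now imports app from undatum.core and calls it, which is the usual shape of a Typer application (typer is newly added to install_requires). A minimal, hypothetical sketch of that pattern, not the actual contents of undatum/core.py:

    # Hypothetical sketch of a Typer-based CLI entry point, mirroring the
    # `from .core import app; app()` pattern introduced in this release.
    # The command and its arguments are illustrative only.
    import typer

    app = typer.Typer()

    @app.command()
    def convert(input_file: str, output_file: str):
        """Convert INPUT_FILE to OUTPUT_FILE, format inferred from extension."""
        typer.echo(f"Converting {input_file} -> {output_file}")

    if __name__ == "__main__":
        app()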
undatum/ai/perplexity.py
@@ -0,0 +1,78 @@
+import requests
+import csv
+import sys
+import os
+from io import StringIO
+
+PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY', )
+
+
+def find_between( s, first, last ):
+    try:
+        start = s.index( first ) + len( first )
+        end = s.index( last, start )
+        return s[start:end]
+    except ValueError:
+        return ""
+
+def get_fields_info(fields, language='English'):
+    """Returns information about data fields"""
+    url = "https://api.perplexity.ai/chat/completions"
+    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
+    payload = {
+        "model": "sonar",
+        "messages": [
+            {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
+            {"role": "user", "content": (
+                f"Please describe in {language} these fields delimited by comma: {fields}"
+                "Please output as single csv table only with following fields: name and description"
+            )},
+        ],
+        "response_format": {
+            "type": "text",
+        },
+    }
+    response = requests.post(url, headers=headers, json=payload).json()
+    text = response["choices"][0]["message"]["content"]
+    a_text = find_between(text, "```csv", "```").strip()
+    if len(a_text) == 0:
+        a_text = find_between(text, "```", "```").strip()
+    f = StringIO()
+    f.write(a_text)
+    f.seek(0)
+    table = {}
+    dr = csv.reader(f, delimiter=',')
+    n = 0
+    for r in dr:
+        n += 1
+        if n == 1: continue
+        table[r[0]] = r[1]
+    return table
+
+
+
+def get_description(data, language='English'):
+    url = "https://api.perplexity.ai/chat/completions"
+    headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
+    payload = {
+        "model": "sonar",
+        "messages": [
+            {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
+            {"role": "user", "content": (
+                f"""
+I have the following CSV data:
+{data}
+Please provide short description in {language} about this data in English. Consider this data as sample of the bigger dataset.Don't generate any code and data examples""")},
+        ],
+        "response_format": {
+            "type": "text",
+        },
+    }
+    response = requests.post(url, headers=headers, json=payload).json()
+    return response["choices"][0]["message"]["content"]
+
+
+
+
+if __name__ == "__main__":
+    print(get_fields_info(sys.argv[1], sys.argv[2]))
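The new module reads PERPLEXITY_API_KEY from the environment at import time and, when executed as a script, passes a comma-separated field list and a language name straight to get_fields_info. A minimal invocation sketch (the key value and field names are placeholders, and the module path assumes the package is installed):

    $ export PERPLEXITY_API_KEY=pplx-xxxxxxxx
    $ python -m undatum.ai.perplexity "id,name,region,population" English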