undatum 1.0.14.tar.gz → 1.0.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of undatum has been flagged as possibly problematic.

Files changed (45)
  1. {undatum-1.0.14/undatum.egg-info → undatum-1.0.15}/PKG-INFO +55 -7
  2. {undatum-1.0.14 → undatum-1.0.15}/README.rst +19 -1
  3. {undatum-1.0.14 → undatum-1.0.15}/setup.py +7 -1
  4. undatum-1.0.15/tests/test.py +12 -0
  5. {undatum-1.0.14 → undatum-1.0.15}/undatum/__init__.py +2 -1
  6. {undatum-1.0.14 → undatum-1.0.15}/undatum/__main__.py +4 -3
  7. undatum-1.0.15/undatum/ai/perplexity.py +78 -0
  8. undatum-1.0.15/undatum/cmds/analyzer.py +472 -0
  9. {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/converter.py +194 -21
  10. undatum-1.0.15/undatum/cmds/ingester.py +110 -0
  11. {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/query.py +3 -7
  12. undatum-1.0.15/undatum/cmds/schemer.py +274 -0
  13. {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/selector.py +63 -36
  14. {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/statistics.py +31 -17
  15. {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/textproc.py +20 -8
  16. undatum-1.0.15/undatum/cmds/transformer.py +78 -0
  17. {undatum-1.0.14 → undatum-1.0.15}/undatum/cmds/validator.py +3 -8
  18. undatum-1.0.15/undatum/common/__init__.py +1 -0
  19. {undatum-1.0.14 → undatum-1.0.15}/undatum/common/functions.py +2 -1
  20. {undatum-1.0.14 → undatum-1.0.15}/undatum/common/iterable.py +3 -3
  21. {undatum-1.0.14 → undatum-1.0.15}/undatum/common/scheme.py +1 -1
  22. {undatum-1.0.14 → undatum-1.0.15}/undatum/constants.py +3 -2
  23. undatum-1.0.15/undatum/core.py +350 -0
  24. undatum-1.0.15/undatum/formats/__init__.py +0 -0
  25. undatum-1.0.15/undatum/formats/docx.py +159 -0
  26. {undatum-1.0.14 → undatum-1.0.15}/undatum/utils.py +2 -34
  27. {undatum-1.0.14 → undatum-1.0.15}/undatum/validate/__init__.py +1 -0
  28. {undatum-1.0.14 → undatum-1.0.15}/undatum/validate/commonrules.py +2 -1
  29. {undatum-1.0.14 → undatum-1.0.15}/undatum/validate/ruscodes.py +1 -0
  30. {undatum-1.0.14 → undatum-1.0.15/undatum.egg-info}/PKG-INFO +55 -7
  31. {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/SOURCES.txt +6 -0
  32. {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/requires.txt +5 -0
  33. undatum-1.0.14/undatum/cmds/analyzer.py +0 -284
  34. undatum-1.0.14/undatum/cmds/schemer.py +0 -46
  35. undatum-1.0.14/undatum/cmds/transformer.py +0 -94
  36. undatum-1.0.14/undatum/core.py +0 -444
  37. {undatum-1.0.14 → undatum-1.0.15}/AUTHORS.rst +0 -0
  38. {undatum-1.0.14 → undatum-1.0.15}/LICENSE +0 -0
  39. {undatum-1.0.14 → undatum-1.0.15}/setup.cfg +0 -0
  40. {undatum-1.0.14/undatum/cmds → undatum-1.0.15/undatum/ai}/__init__.py +0 -0
  41. {undatum-1.0.14/undatum/common → undatum-1.0.15/undatum/cmds}/__init__.py +0 -0
  42. {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/dependency_links.txt +0 -0
  43. {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/entry_points.txt +0 -0
  44. {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/not-zip-safe +0 -0
  45. {undatum-1.0.14 → undatum-1.0.15}/undatum.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: undatum
- Version: 1.0.14
+ Version: 1.0.15
  Summary: undatum: a command-line tool for data processing. Brings CSV simplicity to JSON lines and BSON
  Home-page: https://github.com/datacoon/undatum/
  Download-URL: https://github.com/datacoon/undatum/
@@ -8,7 +8,6 @@ Author: Ivan Begtin
  Author-email: ivan@begtin.tech
  License: MIT
  Keywords: json jsonl csv bson cli dataset
- Platform: UNKNOWN
  Classifier: Development Status :: 5 - Production/Stable
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -24,9 +23,42 @@ Classifier: Topic :: Text Processing
  Classifier: Topic :: Utilities
  Requires-Python: >=3.8
  Description-Content-Type: text/x-rst
- Provides-Extra: python_version == "3.8" or python_version == "3.8"
  License-File: LICENSE
  License-File: AUTHORS.rst
+ Requires-Dist: chardet>=3.0.4
+ Requires-Dist: click>=8.0.3
+ Requires-Dist: dictquery>=0.4.0
+ Requires-Dist: jsonlines>=1.2.0
+ Requires-Dist: openpyxl>=3.0.5
+ Requires-Dist: orjson>=3.6.6
+ Requires-Dist: pandas>=1.1.3
+ Requires-Dist: pymongo>=3.11.0
+ Requires-Dist: qddate>=0.1.1
+ Requires-Dist: tabulate>=0.8.7
+ Requires-Dist: validators>=0.18.1
+ Requires-Dist: xlrd>=1.2.0
+ Requires-Dist: xmltodict
+ Requires-Dist: rich
+ Requires-Dist: duckdb
+ Requires-Dist: pyzstd
+ Requires-Dist: pydantic
+ Requires-Dist: typer
+ Provides-Extra: python-version-3-8-or-python-version-3-8
+ Requires-Dist: argparse>=1.2.1; extra == "python-version-3-8-or-python-version-3-8"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: download-url
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  ==================================================
  undatum -- a command-line tool for data processing
@@ -52,7 +84,7 @@ Main features
  * Common data operations against CSV, JSON lines and BSON files
  * Built-in data filtering
  * Support data compressed with ZIP, XZ, GZ, BZ2
- * Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet file types
+ * Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet, AVRO and ORC file types
  * Low memory footprint
  * Support for compressed datasets
  * Advanced statistics calculations
@@ -278,6 +310,24 @@ Converts CSV file feddomains.csv to Parquet file feddomains.parquet
  $ undatum convert examples/feddomains.csv examples/feddomains.parquet


+ *Data formats conversion table map*
+
+ ============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+ From / To    CSV    JSONlines    BSON    JSON    XLS    XLSX    XML    Parquet    ORC    AVRO
+ ============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+ CSV          -      Yes          Yes     No      No     No      No     Yes        Yes    Yes
+ JSONlines    Yes    -            No      No      No     No      No     Yes        Yes    No
+ BSON         No     Yes          -       No      No     No      No     No         No     No
+ JSON         No     Yes          No      -       No     No      No     No         No     No
+ XLS          No     Yes          Yes     No      -      No      No     No         No     No
+ XLSX         No     Yes          Yes     No      No     -       No     No         No     No
+ XML          No     Yes          No      No      No     No      -      No         No     No
+ Parquet      No     No           No      No      No     No      No     -          No     No
+ ORC          No     No           No      No      No     No      No     No         -      No
+ AVRO         No     No           No      No      No     No      No     No         No     -
+ ============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+
+
  Validate command
  ----------------

@@ -483,5 +533,3 @@ JSONl
  -----

  JSON lines is a replacement to CSV and JSON files, with JSON flexibility and ability to process data line by line, without loading everything into memory.
-
-
README.rst

@@ -22,7 +22,7 @@ Main features
  * Common data operations against CSV, JSON lines and BSON files
  * Built-in data filtering
  * Support data compressed with ZIP, XZ, GZ, BZ2
- * Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet file types
+ * Conversion between CSV, JSONl, BSON, XML, XLS, XLSX, Parquet, AVRO and ORC file types
  * Low memory footprint
  * Support for compressed datasets
  * Advanced statistics calculations
@@ -248,6 +248,24 @@ Converts CSV file feddomains.csv to Parquet file feddomains.parquet
  $ undatum convert examples/feddomains.csv examples/feddomains.parquet


+ *Data formats conversion table map*
+
+ ============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+ From / To    CSV    JSONlines    BSON    JSON    XLS    XLSX    XML    Parquet    ORC    AVRO
+ ============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+ CSV          -      Yes          Yes     No      No     No      No     Yes        Yes    Yes
+ JSONlines    Yes    -            No      No      No     No      No     Yes        Yes    No
+ BSON         No     Yes          -       No      No     No      No     No         No     No
+ JSON         No     Yes          No      -       No     No      No     No         No     No
+ XLS          No     Yes          Yes     No      -      No      No     No         No     No
+ XLSX         No     Yes          Yes     No      No     -       No     No         No     No
+ XML          No     Yes          No      No      No     No      -      No         No     No
+ Parquet      No     No           No      No      No     No      No     -          No     No
+ ORC          No     No           No      No      No     No      No     No         -      No
+ AVRO         No     No           No      No      No     No      No     No         No     -
+ ============ ====== ============ ======= ======= ====== ======= ====== ========== ====== =======
+
+
  Validate command
  ----------------

setup.py

@@ -1,3 +1,4 @@
+ # -*- coding: utf8 -*-
  # This is purely the result of trial and error.

  import sys
@@ -46,7 +47,12 @@ install_requires = [
      'tabulate>=0.8.7',
      'validators>=0.18.1',
      'xlrd>=1.2.0',
-     'xmltodict'
+     'xmltodict',
+     'rich',
+     'duckdb',
+     'pyzstd',
+     'pydantic',
+     'typer'
  ]

tests/test.py (new file)

@@ -0,0 +1,12 @@
+ import pandas as pd
+ from undatum.cmds.analyzer import duckdb_decompose
+
+ DATA = [
+     {"foo": 1, "bar": "some string", "baz": 1.23},
+     {"foo": 2, "bar": "some other string", "baz": 2.34},
+     {"foo": 3, "bar": "yet another string", "baz": 3.45},
+ ]
+
+ df = pd.DataFrame(DATA)
+
+ print(duckdb_decompose(frame=df))
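
The new test exercises duckdb_decompose from the rewritten analyzer against a small pandas DataFrame. The function itself is internal to undatum, but the pattern it plausibly builds on — letting DuckDB query an in-scope DataFrame directly — can be sketched as follows; the SQL shown is an illustration of DataFrame introspection, not the analyzer's actual query:

    import duckdb
    import pandas as pd

    df = pd.DataFrame([
        {"foo": 1, "bar": "some string", "baz": 1.23},
        {"foo": 2, "bar": "some other string", "baz": 2.34},
    ])

    # DuckDB's replacement scan picks up the local DataFrame `df` by name,
    # so column names and inferred types can be inspected without writing
    # the data to disk first.
    print(duckdb.sql("DESCRIBE SELECT * FROM df"))
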
undatum/__init__.py

@@ -1,8 +1,9 @@
+ # -*- coding: utf8 -*-
  """
  undatum: a command-line tool for data processing. Brings CSV simplicity to JSON lines and BSON

  """

- __version__ = "1.0.14"
+ __version__ = "1.0.15"
  __author__ = 'Ivan Begtin'
  __licence__ = 'MIT'
undatum/__main__.py

@@ -1,3 +1,4 @@
+ # -*- coding: utf8 -*-
  #!/usr/bin/env python
  """The main entry point. Invoke as `undatum' or `python -m undatum`.

@@ -7,12 +8,12 @@ import sys

  def main():
      try:
-         from .core import cli
-         exit_status = cli()
+         from .core import app
+         app()
      except KeyboardInterrupt:
          print("Ctrl-C pressed. Aborting")
          sys.exit(0)


  if __name__ == '__main__':
-     main()
+     app()
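
Together with the typer dependency added in setup.py and the rewritten core.py, this suggests the entry point now invokes a Typer application object named app instead of the old click-based cli. A minimal sketch of that pattern, assuming app is a Typer instance (the command shown is hypothetical, not undatum's actual interface):

    import typer

    app = typer.Typer()

    @app.command()
    def headers(filename: str):
        """Hypothetical command: print the first line of FILENAME."""
        with open(filename, encoding="utf-8") as f:
            typer.echo(f.readline().rstrip())

    if __name__ == "__main__":
        # Typer apps are run by calling the app object, as __main__.py now does.
        app()
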
undatum/ai/perplexity.py (new file)

@@ -0,0 +1,78 @@
+ import requests
+ import csv
+ import sys
+ import os
+ from io import StringIO
+
+ PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY', )
+
+
+ def find_between( s, first, last ):
+     try:
+         start = s.index( first ) + len( first )
+         end = s.index( last, start )
+         return s[start:end]
+     except ValueError:
+         return ""
+
+ def get_fields_info(fields, language='English'):
+     """Returns information about data fields"""
+     url = "https://api.perplexity.ai/chat/completions"
+     headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
+     payload = {
+         "model": "sonar",
+         "messages": [
+             {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
+             {"role": "user", "content": (
+                 f"Please describe in {language} these fields delimited by comma: {fields}"
+                 "Please output as single csv table only with following fields: name and description"
+             )},
+         ],
+         "response_format": {
+             "type": "text",
+         },
+     }
+     response = requests.post(url, headers=headers, json=payload).json()
+     text = response["choices"][0]["message"]["content"]
+     a_text = find_between(text, "```csv", "```").strip()
+     if len(a_text) == 0:
+         a_text = find_between(text, "```", "```").strip()
+     f = StringIO()
+     f.write(a_text)
+     f.seek(0)
+     table = {}
+     dr = csv.reader(f, delimiter=',')
+     n = 0
+     for r in dr:
+         n += 1
+         if n == 1: continue
+         table[r[0]] = r[1]
+     return table
+
+
+
+ def get_description(data, language='English'):
+     url = "https://api.perplexity.ai/chat/completions"
+     headers = {"Authorization": f"Bearer {PERPLEXITY_API_KEY}"}
+     payload = {
+         "model": "sonar",
+         "messages": [
+             {"role": "system", "content": "Be precise and concise, provide data output only CSV or JSON, accrording to request"},
+             {"role": "user", "content": (
+                 f"""
+ I have the following CSV data:
+ {data}
+ Please provide short description in {language} about this data in English. Consider this data as sample of the bigger dataset.Don't generate any code and data examples""")},
+         ],
+         "response_format": {
+             "type": "text",
+         },
+     }
+     response = requests.post(url, headers=headers, json=payload).json()
+     return response["choices"][0]["message"]["content"]
+
+
+
+
+ if __name__ == "__main__":
+     print(get_fields_info(sys.argv[1], sys.argv[2]))
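
The new module reads PERPLEXITY_API_KEY from the environment, sends it as a Bearer token to the Perplexity chat-completions endpoint, and parses the model's CSV answer into a {field name: description} mapping. A minimal usage sketch, assuming a valid key is exported and network access is available (the field names below are made up for illustration):

    # Requires: export PERPLEXITY_API_KEY=...  before running.
    from undatum.ai.perplexity import get_fields_info

    # Hypothetical field list; any comma-separated string of column names works.
    fields = "inn,ogrn,company_name,registration_date"
    for name, description in get_fields_info(fields, language="English").items():
        print(f"{name}: {description}")
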