python-jack-knife 0.6.0__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/PKG-INFO +1 -1
  2. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/common.py +19 -33
  3. python_jack_knife-0.6.3/src/pjk/integrations/opensearch_client.py +77 -0
  4. python_jack_knife-0.6.3/src/pjk/integrations/opensearch_query_pipe.py +104 -0
  5. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/integrations/postgres_pipe.py +11 -12
  6. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/integrations/snowflake_pipe.py +14 -31
  7. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/main.py +13 -2
  8. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/parser.py +104 -69
  9. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/denorm.py +6 -3
  10. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/factory.py +3 -1
  11. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/move_field.py +9 -10
  12. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/query_pipe.py +4 -13
  13. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/version.py +1 -1
  14. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/python_jack_knife.egg-info/PKG-INFO +1 -1
  15. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/python_jack_knife.egg-info/SOURCES.txt +2 -0
  16. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/LICENSE +0 -0
  17. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/README.md +0 -0
  18. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/pyproject.toml +0 -0
  19. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/setup.cfg +0 -0
  20. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/__init__.py +0 -0
  21. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/base.py +0 -0
  22. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/integrations/ddb_sink.py +0 -0
  23. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/log.py +0 -0
  24. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/man_page.py +0 -0
  25. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/__init__.py +0 -0
  26. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/filter.py +0 -0
  27. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/head.py +0 -0
  28. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/join.py +0 -0
  29. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/let_reduce.py +0 -0
  30. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/map.py +0 -0
  31. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/progress_pipe.py +0 -0
  32. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/remove_field.py +0 -0
  33. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/sample.py +0 -0
  34. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/select.py +0 -0
  35. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/sort.py +0 -0
  36. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/tail.py +0 -0
  37. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/user_pipe_factory.py +0 -0
  38. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/pipes/where.py +0 -0
  39. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/progress.py +0 -0
  40. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/registry.py +0 -0
  41. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/__init__.py +0 -0
  42. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/create_sink.py +0 -0
  43. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/csv_sink.py +0 -0
  44. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/devnull.py +0 -0
  45. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/dir_sink.py +0 -0
  46. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/expect.py +0 -0
  47. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/factory.py +0 -0
  48. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/format_sink.py +0 -0
  49. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/graph.py +0 -0
  50. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/graph_bar_line.py +0 -0
  51. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/graph_cumulative.py +0 -0
  52. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/graph_hist.py +0 -0
  53. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/graph_scatter.py +0 -0
  54. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/json_sink.py +0 -0
  55. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/s3_sink.py +0 -0
  56. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/s3_stream.py +0 -0
  57. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/sinks.py +0 -0
  58. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/stdout.py +0 -0
  59. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/tsv_sink.py +0 -0
  60. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sinks/user_sink_factory.py +0 -0
  61. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/__init__.py +0 -0
  62. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/csv_source.py +0 -0
  63. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/dir_source.py +0 -0
  64. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/factory.py +0 -0
  65. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/format_source.py +0 -0
  66. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/inline_source.py +0 -0
  67. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/json_source.py +0 -0
  68. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/lazy_file.py +0 -0
  69. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/lazy_file_local.py +0 -0
  70. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/lazy_file_s3.py +0 -0
  71. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/npy_source.py +0 -0
  72. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/parquet_source.py +0 -0
  73. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/s3_source.py +0 -0
  74. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/source_list.py +0 -0
  75. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/sql_source.py +0 -0
  76. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/tsv_source.py +0 -0
  77. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/pjk/sources/user_source_factory.py +0 -0
  78. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/python_jack_knife.egg-info/dependency_links.txt +0 -0
  79. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/python_jack_knife.egg-info/entry_points.txt +0 -0
  80. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/python_jack_knife.egg-info/requires.txt +0 -0
  81. {python_jack_knife-0.6.0 → python_jack_knife-0.6.3}/src/python_jack_knife.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-jack-knife
3
- Version: 0.6.0
3
+ Version: 0.6.3
4
4
  Summary: Python Jack Knife – a command line data processor
5
5
  Author-email: Mike Schultz <mike.schultz@gmail.com>
6
6
  License:
@@ -3,8 +3,9 @@
3
3
 
4
4
  import sys, shutil, subprocess, contextlib, signal
5
5
  import os
6
+ import re
6
7
  import yaml
7
- from pjk.base import TokenError, Integration
8
+ from pjk.base import TokenError, Integration, Source, Pipe
8
9
 
9
10
  class SafeNamespace:
10
11
  def __init__(self, obj):
@@ -73,50 +74,32 @@ def highlight(text: str, color: str = 'bold', value: str = None) -> str:
73
74
  style = COLOR_CODES.get(color.lower(), COLOR_CODES['bold'])
74
75
  return text.replace(value, f"{style}{value}{RESET}")
75
76
 
76
- class Lookups:
77
- def __init__(self, component_class):
78
- self.lookups_yaml = os.path.expanduser('~/.pjk/lookups.yaml')
77
+ class Config:
78
+ def __init__(self, instance_type: str, component_class: Source|Pipe, instance: str):
79
+ # instance = name of the instance, e.g. 'myindexcollection', instance_type = 'index'
80
+ # instance_type only used by automatic config template maker MUST BE STRING LITERAL!
81
+ self.configs_yaml = os.path.expanduser('~/.pjk/component_configs.yaml')
79
82
  self.class_name = type(component_class).__name__
83
+ self.instance = instance
80
84
  self._data = {}
81
85
  self._load()
82
86
 
83
87
  def _load(self):
84
- """Load lookups from YAML file if it exists."""
85
- if os.path.exists(self.lookups_yaml):
86
- with open(self.lookups_yaml, 'r') as f:
88
+ if os.path.exists(self.configs_yaml):
89
+ with open(self.configs_yaml, 'r') as f:
87
90
  self._data = yaml.safe_load(f) or {}
88
91
  else:
89
92
  self._data = {}
90
93
 
91
- def save(self):
92
- """Save current lookups back to YAML file."""
93
- os.makedirs(os.path.dirname(self.lookups_yaml), exist_ok=True)
94
- with open(self.lookups_yaml, 'w') as f:
95
- yaml.safe_dump(self._data, f)
96
-
97
- def get(self, key, default=None):
98
- lookup_key = f'{self.class_name}-{key}'
99
- entry = self._data.get(lookup_key, default)
94
+ def lookup(self, param: str, default=None):
95
+ instance_key = f'{self.class_name}-{self.instance}'
96
+ entry = self._data.get(instance_key, None)
100
97
  if not entry:
101
98
  raise TokenError(
102
- f"~/.pjk/lookups.yaml does not contain entry for '{lookup_key}' with required params."
99
+ f"~/.pjk/component_configs.yaml does not contain entry for '{instance_key}' with required params."
103
100
  )
104
- return entry
105
-
106
- def set(self, key, value):
107
- """Set a lookup value and persist it."""
108
- self._data[key] = value
109
- self.save()
110
-
111
- def delete(self, key):
112
- """Remove a key if it exists and save."""
113
- if key in self._data:
114
- del self._data[key]
115
- self.save()
116
-
117
- def all(self):
118
- """Return the full lookup dictionary."""
119
- return dict(self._data)
101
+
102
+ return entry.get(param, default)
120
103
 
121
104
  class ComponentFactory:
122
105
  def __init__(self, core_components: dict):
@@ -156,3 +139,6 @@ class ComponentFactory:
156
139
 
157
140
  def create(self, token: str):
158
141
  pass
142
+
143
+ def is_valid_field_name(name: str):
144
+ return re.fullmatch(r'^[A-Za-z_][A-Za-z0-9_]*$', name)
@@ -0,0 +1,77 @@
1
+ from pjk.common import Config
2
+
3
+ class OpenSearchClient:
4
+
5
+ @classmethod
6
+ def get_client(cls, config: Config):
7
+ aws_auth = config.lookup("os_auth_use_aws", "true") != "false"
8
+ scheme = config.lookup("os_scheme", "https")
9
+ verify_certs = config.lookup("os_verify_certs", "true") != "false"
10
+ ca_certs = config.lookup("os_ca_certs", None)
11
+ region = config.lookup("os_region", None)
12
+ service = config.lookup("os_service", "es")
13
+ username = config.lookup("os_username", None)
14
+ password = config.lookup("os_password", None)
15
+ timeout = float(config.lookup("os_timeout", 30))
16
+ ssl_assert_hostname = config.lookup("os_ssl_assert_hostname", "true") != "false"
17
+ ssl_show_warn = config.lookup("os_ssl_show_warn", "false") == "true"
18
+ host = config.lookup("os_host", None)
19
+ port = config.lookup("os_port", None)
20
+
21
+ # Reasonable port defaults
22
+ if port is None:
23
+ port = 443 if scheme == "https" else 9200
24
+ else:
25
+ port = int(port)
26
+
27
+ if host is None:
28
+ raise ValueError("Config os_host is required (set os_host + os_port/os_scheme, or a connection profile).")
29
+
30
+ # Lazy import so this module can still be imported if deps aren't installed.
31
+ try:
32
+ from opensearchpy import OpenSearch, RequestsHttpConnection, Urllib3HttpConnection
33
+ except Exception as e:
34
+ raise RuntimeError("opensearch-py must be installed to use OpenSearchQueryPipe") from e
35
+
36
+ http_auth = None
37
+ connection_class = Urllib3HttpConnection # default
38
+ use_ssl = (scheme == "https")
39
+
40
+ if aws_auth:
41
+ # AWS SigV4 (works for OpenSearch Service / legacy ES domains)
42
+ try:
43
+ import boto3
44
+ from requests_aws4auth import AWS4Auth
45
+ except Exception as e:
46
+ raise RuntimeError("boto3 and requests-aws4auth are required for os_auth_method='aws'") from e
47
+
48
+ if not region:
49
+ raise ValueError("Config os_region is required for os_auth_method='aws'.")
50
+
51
+ session = boto3.Session()
52
+ credentials = session.get_credentials()
53
+ if credentials is None:
54
+ raise RuntimeError("No AWS credentials found (boto3 session.get_credentials() returned None).")
55
+
56
+ creds = credentials.get_frozen_credentials()
57
+ http_auth = AWS4Auth(creds.access_key, creds.secret_key, region, service, session_token=creds.token)
58
+ connection_class = RequestsHttpConnection # SigV4 signing via requests path
59
+
60
+ else:
61
+ if not (username and password):
62
+ raise ValueError("os_username and os_password are required for os_auth_method='basic'.")
63
+ http_auth = (username, password)
64
+
65
+ # Build client
66
+ client = OpenSearch(
67
+ hosts=[{"host": host, "port": port}],
68
+ http_auth=http_auth,
69
+ use_ssl=use_ssl,
70
+ verify_certs=verify_certs,
71
+ ssl_assert_hostname=ssl_assert_hostname,
72
+ ssl_show_warn=ssl_show_warn,
73
+ ca_certs=ca_certs,
74
+ timeout=timeout,
75
+ connection_class=connection_class,
76
+ )
77
+ return client
@@ -0,0 +1,104 @@
1
+ import os
2
+ import sys
3
+ import traceback
4
+ from copy import deepcopy
5
+ from typing import Optional, Iterator, Dict, Any, Iterable
6
+
7
+ from pjk.base import Pipe, ParsedToken, Usage, Integration
8
+ from pjk.pipes.query_pipe import QueryPipe
9
+ from pjk.common import Config
10
+ from pjk.integrations.opensearch_client import OpenSearchClient
11
+
12
+ def build_body_from_string(query_string: str) -> dict:
13
+ if query_string == "*":
14
+ return {"query": {"match_all": {}}}
15
+ else:
16
+ return {
17
+ "query": {
18
+ "simple_query_string": {
19
+ "query": query_string
20
+ }
21
+ }
22
+ }
23
+
24
+ class OpenSearchQueryPipe(QueryPipe, Integration):
25
+ name = "os_query"
26
+ desc = "Opensearch query pipe. Uses record['query_string'] or record['query_object'] for os query"
27
+ arg0 = ("index", "index to query over")
28
+ examples = [
29
+ ["{'query_string': '*'}", 'os_query:myindex', '-'],
30
+ ["{'query_string': 'dog'}", 'os_query:myindex', '-'],
31
+ ["{'query_string': 'dog AND cat'}", 'os_query:myindex', '-'],
32
+ ["{'query_object': {query: {...}}", 'os_query:myindex', '-'],
33
+ ]
34
+
35
+ def __init__(self, ptok: ParsedToken, usage: Usage):
36
+ super().__init__(ptok, usage)
37
+
38
+ # index from arg0 or config
39
+ self.index = ptok.get_arg(0)
40
+
41
+ # Build the OpenSearch client (handles AWS/basic/none)
42
+ config = Config('index', self, self.index)
43
+ self.client = OpenSearchClient.get_client(config)
44
+
45
+ # Iteration state
46
+ self.cur_record: Optional[Dict[str, Any]] = None
47
+ self.hits_iter: Optional[Iterator[Dict[str, Any]]] = None
48
+
49
+ def reset(self):
50
+ # keep the index open between drains
51
+ pass
52
+
53
+ def close(self):
54
+ pass
55
+
56
+ def execute_query_returning_Q_xR_iterable(self, query_record: dict) -> Iterator[Dict[str, Any]]:
57
+ query_string = query_record.get('query_string', None)
58
+ query_body = None
59
+
60
+ if query_string:
61
+ query_body = build_body_from_string(query_string)
62
+ else:
63
+ query_body = query_record.get('query_object')
64
+
65
+ try:
66
+ # Build final request body
67
+ req_body = deepcopy(query_body)
68
+ req_body["size"] = self.count
69
+
70
+ res = self.client.search(index=self.index, body=req_body)
71
+
72
+ total_hits = 0
73
+ took = res.get("took")
74
+ hits = res.get("hits", {}).get("hits", [])
75
+ total_obj = res.get("hits", {}).get("total", {})
76
+ if isinstance(total_obj, dict):
77
+ total_hits = total_obj.get("value", 0)
78
+ elif isinstance(total_obj, int):
79
+ total_hits = total_obj
80
+
81
+ # Emit a metadata record first
82
+ yield {
83
+ "took_ms": took,
84
+ "total_hits": total_hits,
85
+ "index": self.index,
86
+ "os_query_body": req_body
87
+ }
88
+
89
+ # Emit each hit
90
+ for hit in hits:
91
+ if "_source" in hit and isinstance(hit["_source"], dict):
92
+ yield hit["_source"]
93
+ else:
94
+ # Some queries (e.g., stored fields only) might not include _source
95
+ yield {"_type": "os_query_hit", "_hit": hit}
96
+
97
+ except Exception as e:
98
+ print("OpenSearch query error:", e, file=sys.stderr)
99
+ traceback.print_exc()
100
+ yield {
101
+ "_type": "os_query_error",
102
+ "error": str(e),
103
+ "query_record": query_record,
104
+ }
@@ -10,7 +10,7 @@ from decimal import Decimal
10
10
  from typing import Any, Dict, Optional
11
11
 
12
12
  from pjk.base import Integration, ParsedToken, Usage
13
- from pjk.common import Lookups
13
+ from pjk.common import Config
14
14
  from pjk.pipes.query_pipe import QueryPipe
15
15
 
16
16
 
@@ -91,26 +91,25 @@ def _row_to_dict(cursor, row) -> Dict[str, Any]:
91
91
 
92
92
 
93
93
  class PostgresPipe(QueryPipe,Integration):
94
- name = 'pgres'
94
+ name = 'postgres'
95
95
  desc = "Postgres query pipe; executes SQL from input."
96
96
  arg0 = ("dbname", 'database name.')
97
97
  examples = [
98
- ['myquery.sql', 'pgres:mydb', '-'],
99
- ["{'query': 'SELECT * from MY_TABLE;'}", 'pgres:mydb', '-'],
100
- ["{'query': 'SELECT * FROM pg_catalog.pg_tables;'}", 'pgres:mydb']
98
+ ['myquery.sql', 'postgres:mydb', '-'],
99
+ ["{'query': 'SELECT * from MY_TABLE;'}", 'postgres:mydb', '-'],
100
+ ["{'query': 'SELECT * FROM pg_catalog.pg_tables;'}", 'postgres:mydb']
101
101
  ]
102
102
 
103
103
  def __init__(self, ptok: ParsedToken, usage: Usage):
104
104
  super().__init__(ptok, usage)
105
105
 
106
- lookups = Lookups(self)
107
106
  self.dbname = usage.get_arg("dbname")
108
-
109
- self.db_host = self.lookup_params.get("host")
110
- self.db_user = self.lookup_params.get("user")
111
- self.db_pass = self.lookup_params.get("password")
112
- self.db_port = int(self.lookup_params.get("port", 5432))
113
- self.db_ssl = bool(self.lookup_params.get("ssl", False))
107
+ config = Config('dbname', self, self.dbname)
108
+ self.db_host = config.lookup("host")
109
+ self.db_user = config.lookup("user")
110
+ self.db_pass = config.lookup("password")
111
+ self.db_port = int(config.lookup("port", 5432))
112
+ self.db_ssl = bool(config.lookup("ssl", False))
114
113
 
115
114
  self.query_field = usage.get_param('query_field')
116
115
  self.params_field = "params" # optional: list/tuple (positional) or dict (named)
@@ -9,8 +9,9 @@ import uuid
9
9
  from decimal import Decimal
10
10
  from typing import Any, Dict, Optional
11
11
 
12
- from pjk.base import ParsedToken, Usage, TokenError, Integration
12
+ from pjk.base import ParsedToken, TokenError, Usage, Integration
13
13
  from pjk.pipes.query_pipe import QueryPipe
14
+ from pjk.common import Config
14
15
 
15
16
 
16
17
  # ---------- utilities ----------
@@ -55,23 +56,6 @@ def _row_to_dict(cursor, row) -> Dict[str, Any]:
55
56
  return {col: normalize(val) for col, val in zip(cols, row)}
56
57
 
57
58
 
58
- def _get_any(params: Dict[str, Any], *keys: str) -> Optional[Any]:
59
- """
60
- Fetch a value from params using any of the provided keys,
61
- trying case variants and optional SNOWFLAKE_ prefix.
62
- """
63
- variants = []
64
- for k in keys:
65
- variants.extend([
66
- k, k.lower(), k.upper(),
67
- f"snowflake_{k}".lower(), f"SNOWFLAKE_{k}".upper()
68
- ])
69
- for v in variants:
70
- if v in params:
71
- return params[v]
72
- return None
73
-
74
-
75
59
  # ---------- client ----------
76
60
 
77
61
  class SnowflakeClient:
@@ -141,29 +125,28 @@ class SnowflakeClient:
141
125
  class SnowflakePipe(QueryPipe, Integration):
142
126
  """
143
127
  Snowflake query pipe; executes SQL found in input record['query'] and streams rows.
144
- Connection/session settings are pulled from ~/.pjk/lookups.yaml under the arg name.
128
+ Connection/session settings are pulled from ~/.pjk/component_configs.yaml under the arg name.
145
129
  """
146
130
  name = 'snowflake'
147
131
  desc = "Snowflake query pipe; executes an SQL query for each input record."
148
132
  arg0 = ('dbname', 'database name.')
149
133
  examples = [
150
- ["{'query': 'SELECT CURRENT_ROLE();'}", "snow:EDLDB", "-"],
151
- ["myquery.sql", "snow:EDLDB", "-"]
134
+ ["{'query': 'SELECT CURRENT_ROLE();'}", "snowflake:EDLDB", "-"],
135
+ ["myquery.sql", "snowflake:EDLDB", "-"]
152
136
  ]
153
137
 
154
138
  def __init__(self, ptok: ParsedToken, usage: Usage):
155
139
  super().__init__(ptok, usage)
156
140
 
157
- self.dbname = usage.get_arg(type(self).arg0[0])
158
-
159
- # Accept both bare keys and SNOWFLAKE_* variants in lookups.yaml
160
- self.sf_account = _get_any(self.lookup_params, "account")
161
- self.sf_user = _get_any(self.lookup_params, "user")
162
- self.sf_auth = _get_any(self.lookup_params, "authenticator")
163
- self.sf_role = _get_any(self.lookup_params, "role")
164
- self.sf_wh = _get_any(self.lookup_params, "warehouse")
141
+ self.dbname = usage.get_arg('dbname')
142
+ config = Config('dbname', self, self.dbname)
143
+ self.sf_account = config.lookup("account")
144
+ self.sf_user = config.lookup("user")
145
+ self.sf_auth = config.lookup("authenticator")
146
+ self.sf_role = config.lookup("role")
147
+ self.sf_wh = config.lookup("warehouse")
148
+ self.sf_schema = config.lookup("schema")
165
149
  self.sf_db = self.dbname
166
- self.sf_schema = _get_any(self.lookup_params, "schema")
167
150
 
168
151
  # Basic validation
169
152
  missing = [k for k, v in [
@@ -176,7 +159,7 @@ class SnowflakePipe(QueryPipe, Integration):
176
159
  ] if not v]
177
160
  if missing:
178
161
  raise TokenError(
179
- f"lookups entry '{self.dbname}' missing: {', '.join(missing)}"
162
+ f"config entry '{self.dbname}' missing: {', '.join(missing)}"
180
163
  )
181
164
 
182
165
  self.query_field = usage.get_param('query_field')
@@ -11,6 +11,8 @@ from pjk.parser import ExpressionParser
11
11
  from pjk.base import UsageError
12
12
  from pjk.log import init as init_logging
13
13
  from datetime import datetime
14
+ from pathlib import Path
15
+ import shutil
14
16
  import traceback
15
17
  import concurrent.futures
16
18
  from pjk.registry import ComponentRegistry
@@ -65,13 +67,22 @@ def execute_threaded(sinks, stop_progress=None):
65
67
  else:
66
68
  executor.shutdown(wait=True)
67
69
 
70
+ def initialize():
71
+ init_logging()
72
+
73
+ '''
74
+ src = Path("src/pjk/resources/component_configs.tmpl")
75
+ dst_dir = Path.home() / ".pjk"
76
+ dst_dir.mkdir(parents=True, exist_ok=True)
77
+ shutil.copy(src, dst_dir / src.name)
78
+ '''
79
+
68
80
  def execute(command: str):
69
81
  tokens = shlex.split(command, comments=True, posix=True)
70
82
  execute_tokens(tokens)
71
83
 
72
84
  def execute_tokens(tokens: List[str]):
73
- init_logging()
74
- # (remove the sys.exit SIGINT handler here)
85
+ initialize()
75
86
 
76
87
  if '--version' in tokens:
77
88
  print(f"pjk version {__version__}")
@@ -55,8 +55,20 @@ class OperandStack:
55
55
  return None
56
56
  return self.stack[-1]
57
57
 
58
+ def clear(self):
59
+ self.stack.clear()
60
+
58
61
  def empty(self):
59
62
  return len(self.stack) == 0
63
+
64
+ def print(self, toadd):
65
+ print('---------')
66
+ if toadd:
67
+ print(f'{type(toadd).__name__}={id(toadd)}')
68
+ if len(self.stack) == 0:
69
+ print(f'Stack={id(self)} StackEmpty')
70
+ for op in self.stack:
71
+ print(f'Stack={id(self)} {type(op).__name__}={id(op)}')
60
72
 
61
73
  class ExpressionParser:
62
74
  def __init__(self, registry: ComponentRegistry):
@@ -69,6 +81,9 @@ class ExpressionParser:
69
81
  'pjk <source> [<pipe> ...] <sink>'])
70
82
 
71
83
  source = self.stack.pop()
84
+ if isinstance(source, SubExpression):
85
+ raise TokenError("Poorly formed sub-expression. Begin token '[' without matching 'over' keyword." )
86
+
72
87
  if not self.stack.empty():
73
88
  raise TokenError.from_list(['A sink can only consume one source.',
74
89
  'pjk <source> [<pipe> ...] <sink>'])
@@ -167,29 +182,38 @@ class StackLoader:
167
182
  return ReducerAggregatorPipe(top_level_reducers=self.top_level_reducers)
168
183
 
169
184
  def add_operator(self, op, stack: OperandStack):
185
+ #stack.print(op)
186
+
170
187
  if not stack.empty() and isinstance(stack.peek(), SubExpression):
171
- top = stack.peek()
188
+ subexp = stack.peek()
172
189
 
173
- if isinstance(op, SubExpressionOver):
174
- subexp_begin = stack.pop()
175
- subexp_begin.set_over_arg(op.get_over_arg())
176
- op.add_source(subexp_begin)
190
+ if isinstance(op, SubExpressionOver) and subexp.recursion_depth() == 0:
191
+ subexp = stack.pop()
192
+ op.add_source(subexp)
177
193
  stack.push(op)
178
194
 
179
195
  global stack_level
180
- # SEEMS LIKE A HACK! FIXME. The stack should handle this but its off by one
181
- stack_level-=1
196
+ stack_level -=1 # not sure why this can't be handled exclusively by the stack
182
197
  return
198
+
183
199
  else: # an operator within the subexpression
184
- top.add_subop(op)
200
+ subexp = stack.peek()
201
+ subexp.add_subop(op)
185
202
  return
186
203
 
204
+ if isinstance(op, SubExpressionOver):
205
+ if stack.empty or not isinstance(stack.peek(), SubExpression):
206
+ raise TokenError("Poorly formed sub-expression. 'over' keyword without matching begin token '['.")
207
+ op.add_source(stack.pop())
208
+ stack.push(op)
209
+
187
210
  # order matters, because sources are pipes
188
211
  if isinstance(op, Pipe):
189
212
  arity = op.arity # class level attribute
190
213
  for _ in range(arity):
191
214
  if stack.empty():
192
- raise UsageError(f"'{op}' requires {arity} input(s)")
215
+ name = type(op).usage().name
216
+ raise TokenError(f"'{name}' requires {arity} input(s)")
193
217
  op.add_source(stack.pop())
194
218
  stack.push(op)
195
219
 
@@ -243,6 +267,31 @@ class UpstreamSource(Source):
243
267
  self.sub_recs_in.increment()
244
268
  yield item
245
269
 
270
+ class SubExpressionOver(Pipe):
271
+ @classmethod
272
+ def usage(cls) -> Usage:
273
+ u = Usage(
274
+ name="over",
275
+ desc="sub-expression over.",
276
+ component_class=cls,
277
+ )
278
+ return u
279
+
280
+ def __init__(self, ptok: ParsedToken, usage: Usage):
281
+ super().__init__(ptok, usage)
282
+ self.over_arg = ptok.get_arg(0)
283
+
284
+ def reset(self):
285
+ pass # stateless
286
+
287
+ def __iter__(self):
288
+ if not isinstance(self.left, SubExpression):
289
+ raise Exception('this actually cannot happen, but did')
290
+
291
+ for record in self.left:
292
+ self.left.subexp_process(record, self.over_arg)
293
+ yield record
294
+
246
295
  class SubExpression(Pipe, ProgressIgnore):
247
296
  @classmethod
248
297
  def create(cls, token: str) -> Pipe:
@@ -255,28 +304,34 @@ class SubExpression(Pipe, ProgressIgnore):
255
304
 
256
305
  def __init__(self, ptok: ParsedToken, usage: Usage):
257
306
  super().__init__(ptok)
258
- self.over_arg = None
259
- self.over_field = None
260
307
  self.subexp_ops = []
261
- self.over_pipe = None
262
308
  self.stack_helper = StackLoader()
263
309
  self.subexp_stack = OperandStack()
264
310
  self.upstream_source = UpstreamSource()
265
311
  self.subexp_stack.push(self.upstream_source)
312
+ self.recursions = 0 # number of subexpression within
313
+ self.subexp_left = None
266
314
 
267
315
  def add_subop(self, op):
268
316
  self.subexp_ops.append(op)
317
+ if isinstance(op, SubExpression):
318
+ self.recursions += 1
319
+ elif isinstance(op, SubExpressionOver):
320
+ self.recursions -= 1
269
321
  self.stack_helper.add_operator(op, self.subexp_stack)
270
322
 
271
- def set_over_arg(self, over_arg): #FIXME, this should take QueryPipe
272
- self.over_arg = over_arg
273
- if over_arg.endswith('.py'):
274
- self.over_field = 'child'
275
- self.over_pipe = UserPipeFactory.create(over_arg)
276
- self.upstream_source.set_source(self.over_pipe)
277
- self.subexp_ops.append(self.over_pipe)
278
- else:
279
- self.over_field = over_arg
323
+ def recursion_depth(self):
324
+ return self.recursions
325
+
326
+ #def bind(self, subex_over: SubExpressionOver):
327
+ # self.over_arg = subex_over.get_over_arg()
328
+ # if self.over_arg.endswith('.py'):
329
+ # self.over_field = 'child'
330
+ # self.over_pipe = UserPipeFactory.create(self.over_arg)
331
+ # self.upstream_source.set_source(self.over_pipe)
332
+ # self.subexp_ops.append(self.over_pipe)
333
+ # else:
334
+ # self.over_field = self.over_arg
280
335
 
281
336
  def reset(self):
282
337
  for op in self.subexp_ops:
@@ -284,60 +339,40 @@ class SubExpression(Pipe, ProgressIgnore):
284
339
  op.reset()
285
340
 
286
341
  def __iter__(self):
287
- for record in self.left:
288
- if self.over_pipe:
289
- one = UpstreamSource()
290
- one.add_item(record)
291
- self.over_pipe.set_sources([one])
292
- else:
293
- field_data = record.pop(self.over_field, None)
294
- if not field_data:
295
- yield record
296
- continue
297
- if isinstance(field_data, list):
298
- self.upstream_source.set_list(field_data)
299
- else:
300
- self.upstream_source.set_list([field_data])
342
+ yield from self.left # pass thru to subexp_over which then calls process
301
343
 
302
- # Reset sub-pipe stack
303
- for op in self.subexp_ops:
304
- op.reset()
344
+ def subexp_process(self, record: dict, over_field: str):
345
+ #for record in self.left:
346
+ # if self.over_pipe:
347
+ # one = UpstreamSource()
348
+ # one.add_item(record)
349
+ # self.over_pipe.set_sources([one])
305
350
 
306
- out_recs = []
307
- top = self.subexp_stack.peek()
308
- for rec in top:
309
- out_recs.append(rec)
351
+ if not self.subexp_left:
352
+ self.subexp_left = self.subexp_stack.pop()
310
353
 
311
- record[self.over_field] = out_recs
354
+ field_data = record.pop(over_field, None)
355
+ if not field_data:
356
+ return
312
357
 
313
- for op in self.subexp_ops:
314
- get_subexp = getattr(op, "get_subexp_result", None)
315
- if get_subexp:
316
- name, value = get_subexp()
317
- if name:
318
- record[name] = value
358
+ if isinstance(field_data, list):
359
+ self.upstream_source.set_list(field_data)
360
+ else:
361
+ self.upstream_source.set_list([field_data])
319
362
 
320
- yield record
363
+ # Reset sub-pipe stack
364
+ for op in self.subexp_ops:
365
+ op.reset()
321
366
 
322
- class SubExpressionOver(Pipe):
323
- @classmethod
324
- def usage(cls) -> Usage:
325
- u = Usage(
326
- name="over",
327
- desc="sub-expression over.",
328
- component_class=cls,
329
- )
330
- return u
331
-
332
- def __init__(self, ptok: ParsedToken, usage: Usage):
333
- super().__init__(ptok, usage)
334
- self.over_arg = ptok.get_arg(0)
367
+ out_recs = []
335
368
 
336
- def get_over_arg(self):
337
- return self.over_arg
369
+ for rec in self.subexp_left:
370
+ out_recs.append(rec)
338
371
 
339
- def reset(self):
340
- pass # stateless
372
+ record[over_field] = out_recs
341
373
 
342
- def __iter__(self):
343
- yield from self.left
374
+ for op in self.subexp_ops:
375
+ if isinstance(op, ReducePipe):
376
+ name, value = op.get_subexp_result()
377
+ if name:
378
+ record[name] = value
@@ -5,6 +5,7 @@
5
5
 
6
6
  from pjk.base import Pipe, ParsedToken, Usage, UsageError
7
7
  from typing import Iterator
8
+ from pjk.progress import papi
8
9
 
9
10
  class Denormer:
10
11
  def __init__(self, record, field):
@@ -23,7 +24,7 @@ class Denormer:
23
24
  elif isinstance(data, dict):
24
25
  self.subrec_list = [data]
25
26
  else:
26
- raise UsageError("can only denorm sub-records")
27
+ raise UsageError("can only explode sub-records")
27
28
 
28
29
  def __iter__(self) -> Iterator[dict]:
29
30
  for subrec in self.subrec_list:
@@ -53,8 +54,8 @@ class DenormPipe(Pipe):
53
54
  super().__init__(ptok)
54
55
 
55
56
  self.field = usage.get_arg('field')
56
- if not self.field:
57
- raise UsageError("denorm must include a field name")
57
+ self.recs_in = papi.get_counter(self, None) # don't display
58
+ self.recs_out = papi.get_percentage_counter(self, 'recs_out', self.recs_in)
58
59
 
59
60
  self._pending_iter = None
60
61
 
@@ -63,6 +64,8 @@ class DenormPipe(Pipe):
63
64
 
64
65
  def __iter__(self):
65
66
  for record in self.left:
67
+ self.recs_in.increment()
66
68
  denormer = Denormer(record, self.field)
67
69
  for out in denormer:
70
+ self.recs_out.increment()
68
71
  yield out
@@ -20,6 +20,7 @@ from pjk.pipes.select import SelectFields
20
20
  from pjk.pipes.denorm import DenormPipe
21
21
  from pjk.integrations.postgres_pipe import PostgresPipe
22
22
  from pjk.integrations.snowflake_pipe import SnowflakePipe
23
+ from pjk.integrations.opensearch_query_pipe import OpenSearchQueryPipe
23
24
  from pjk.pipes.sample import SamplePipe
24
25
  from pjk.pipes.user_pipe_factory import UserPipeFactory
25
26
 
@@ -40,7 +41,8 @@ COMPONENTS = {
40
41
  'sample': SamplePipe,
41
42
  'explode': DenormPipe,
42
43
  'postgres': PostgresPipe,
43
- 'snowflake': SnowflakePipe
44
+ 'snowflake': SnowflakePipe,
45
+ 'os_query': OpenSearchQueryPipe
44
46
  }
45
47
 
46
48
  class PipeFactory(ComponentFactory):
@@ -3,34 +3,33 @@
3
3
 
4
4
  # pjk/pipes/move_field.py
5
5
 
6
- from pjk.base import Pipe, ParsedToken, Usage
6
+ from pjk.base import Pipe, ParsedToken, Usage, TokenError
7
+ from pjk.common import is_valid_field_name
7
8
 
8
9
  class MoveField(Pipe):
9
10
  @classmethod
10
11
  def usage(cls):
11
- usage = Usage(
12
+ u = Usage(
12
13
  name='as',
13
14
  desc='rename a field in the record',
14
15
  component_class=cls
15
16
  )
16
- usage.def_arg(name='src', usage='Source field name')
17
- usage.def_arg(name='dst', usage='Destination field name')
18
- usage.def_example(expr_tokens=['{up:1}', 'as:up:down'], expect="{down:1}")
17
+ u.def_arg(name='src', usage='Source field name')
18
+ u.def_arg(name='dst', usage='Destination field name')
19
+ u.def_example(expr_tokens=['{up:1}', 'as:up:down'], expect="{down:1}")
19
20
 
20
- return usage
21
+ return u
21
22
 
22
23
  def __init__(self, ptok: ParsedToken, usage: Usage):
23
24
  super().__init__(ptok, usage)
24
25
  self.src = usage.get_arg('src')
25
26
  self.dst = usage.get_arg('dst')
26
- self.count = 0
27
27
 
28
- def reset(self):
29
- self.count = 0
28
+ if not is_valid_field_name(self.dst) or not is_valid_field_name(self.src):
29
+ raise TokenError('field names only allow letters, numbers (non-initially) and underbar')
30
30
 
31
31
  def __iter__(self):
32
32
  for record in self.left:
33
- self.count += 1
34
33
  if self.src in record:
35
34
  record[self.dst] = record.pop(self.src)
36
35
  yield record
@@ -1,11 +1,9 @@
1
- from pjk.base import Pipe, ParsedToken, Usage, TokenError
2
- from pjk.common import Lookups
1
+ from pjk.base import Pipe, ParsedToken, Usage
3
2
  from typing import Any, Dict, Iterable, Optional
4
3
  from abc import abstractmethod
5
4
 
6
5
 
7
6
  class QueryPipe(Pipe):
8
- requires_lookups = True
9
7
  name: str = None
10
8
  desc: str = None
11
9
  arg0: tuple[Optional[str], Optional[str]] = (None, None)
@@ -18,8 +16,7 @@ class QueryPipe(Pipe):
18
16
  desc=cls.desc,
19
17
  component_class=cls
20
18
  )
21
- u.def_arg(name=cls.arg0[0], usage=f'{cls.arg0[1]} ~/.pjk/lookups.yaml must containing entry {cls.__name__}-<{cls.arg0[0]}>\n containing necessary parameters.')
22
- u.def_param("query_field", usage="field of query.", default="query")
19
+ u.def_arg(name=cls.arg0[0], usage=f"{cls.arg0[1]} ~/.pjk/component_configs.yaml must contain entry '{cls.__name__}-<{cls.arg0[0]}'>\n with necessary parameters.")
23
20
  u.def_param("count", usage="Number of search results, (databases may ignore)", is_num=True, default="10")
24
21
  u.def_param("shape", usage='the shape of ouput records', is_num=False,
25
22
  valid_values={'xR', 'Q_xR', 'Qxr'}, default='xR')
@@ -32,14 +29,8 @@ class QueryPipe(Pipe):
32
29
 
33
30
  def __init__(self, ptok: ParsedToken, usage: Usage):
34
31
  super().__init__(ptok, usage)
35
-
36
- self.lookup_params = None
37
- if type(self).requires_lookups:
38
- lookups = Lookups(self)
39
- lookup_name = usage.get_arg(type(self).arg0[0])
40
- self.lookup_params = lookups.get(lookup_name)
41
-
42
32
  self.output_shape = usage.get_param('shape')
33
+ self.count = usage.get_param('count')
43
34
 
44
35
  @abstractmethod
45
36
  def execute_query_returning_Q_xR_iterable(self, record) -> Iterable[Dict[str, Any]]:
@@ -83,7 +74,7 @@ class QueryPipe(Pipe):
83
74
  q_out = self._make_q_object(in_rec, out_rec)
84
75
  continue
85
76
  r_list.append(out_rec)
86
- q_out['result'] = r_list
77
+ q_out['child'] = r_list
87
78
  yield q_out
88
79
 
89
80
 
@@ -1,4 +1,4 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Copyright 2024 Mike Schultz
3
3
 
4
- __version__ = "0.6.0"
4
+ __version__ = "0.6.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-jack-knife
3
- Version: 0.6.0
3
+ Version: 0.6.3
4
4
  Summary: Python Jack Knife – a command line data processor
5
5
  Author-email: Mike Schultz <mike.schultz@gmail.com>
6
6
  License:
@@ -12,6 +12,8 @@ src/pjk/progress.py
12
12
  src/pjk/registry.py
13
13
  src/pjk/version.py
14
14
  src/pjk/integrations/ddb_sink.py
15
+ src/pjk/integrations/opensearch_client.py
16
+ src/pjk/integrations/opensearch_query_pipe.py
15
17
  src/pjk/integrations/postgres_pipe.py
16
18
  src/pjk/integrations/snowflake_pipe.py
17
19
  src/pjk/pipes/__init__.py