tableconv-1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tableconv/__init__.py +20 -0
- tableconv/__version__.py +3 -0
- tableconv/adapters/__init__.py +0 -0
- tableconv/adapters/df/__init__.py +25 -0
- tableconv/adapters/df/ascii.py +130 -0
- tableconv/adapters/df/aws_athena.py +333 -0
- tableconv/adapters/df/aws_dynamodb.py +37 -0
- tableconv/adapters/df/aws_logs.py +68 -0
- tableconv/adapters/df/base.py +65 -0
- tableconv/adapters/df/example.py +14 -0
- tableconv/adapters/df/file_adapter_mixin.py +69 -0
- tableconv/adapters/df/gsheets.py +438 -0
- tableconv/adapters/df/jc.py +57 -0
- tableconv/adapters/df/jira.py +13 -0
- tableconv/adapters/df/json.py +215 -0
- tableconv/adapters/df/leveldb.py +20 -0
- tableconv/adapters/df/nested_list.py +95 -0
- tableconv/adapters/df/numbers.py +32 -0
- tableconv/adapters/df/osquery.py +43 -0
- tableconv/adapters/df/pandas_io.py +238 -0
- tableconv/adapters/df/pcap.py +73 -0
- tableconv/adapters/df/python.py +39 -0
- tableconv/adapters/df/rdbms.py +152 -0
- tableconv/adapters/df/smart_sheet.py +72 -0
- tableconv/adapters/df/sql_literal.py +24 -0
- tableconv/adapters/df/sumo_logic.py +195 -0
- tableconv/adapters/df/text_array.py +146 -0
- tableconv/adapters/df/yaml.py +43 -0
- tableconv/core.py +427 -0
- tableconv/exceptions.py +118 -0
- tableconv/in_memory_query.py +93 -0
- tableconv/interactive.py +182 -0
- tableconv/main.py +358 -0
- tableconv/parse_time.py +21 -0
- tableconv/uri.py +47 -0
- tableconv-1.8.dist-info/METADATA +297 -0
- tableconv-1.8.dist-info/RECORD +42 -0
- tableconv-1.8.dist-info/WHEEL +4 -0
- tableconv-1.8.dist-info/entry_points.txt +2 -0
- tableconv-1.8.dist-info/licenses/LICENSE +7 -0
- tableconv_daemon/__init__.py +0 -0
- tableconv_daemon/main.py +247 -0
tableconv/__init__.py
ADDED
@@ -0,0 +1,20 @@
+import logging
+
+# One gripe of python is some libraries immediately start logging unhelpful stuff the moment you import them.
+# This triggers flake8 E402 (code before imports error). How are you supposed to resolve this cleanly?
+logging.getLogger("numexpr").setLevel(logging.ERROR)
+
+from .__version__ import __version__  # noqa: E402
+from .core import IntermediateExchangeTable, load_url  # noqa: E402
+from .exceptions import DataError, EmptyDataError, InvalidQueryError, InvalidURLError, SuppliedDataError  # noqa: E402
+
+__all__ = [
+    "IntermediateExchangeTable",
+    "load_url",
+    "EmptyDataError",
+    "DataError",
+    "InvalidQueryError",
+    "InvalidURLError",
+    "SuppliedDataError",
+    "__version__",
+]
tableconv/__version__.py
ADDED

tableconv/adapters/__init__.py
ADDED
File without changes

tableconv/adapters/df/__init__.py
ADDED
@@ -0,0 +1,25 @@
+from tableconv.adapters.df.base import adapters, read_adapters, write_adapters  # noqa: F401
+
+from .ascii import *  # noqa: F401 F403
+from .aws_athena import *  # noqa: F401 F403
+from .aws_dynamodb import *  # noqa: F401 F403
+from .aws_logs import *  # noqa: F401 F403
+from .gsheets import *  # noqa: F401 F403
+from .jc import *  # noqa: F401 F403
+from .jira import *  # noqa: F401 F403
+from .json import *  # noqa: F401 F403
+from .leveldb import *  # noqa: F401 F403
+from .nested_list import *  # noqa: F401 F403
+from .numbers import *  # noqa: F401 F403
+from .osquery import *  # noqa: F401 F403
+from .pandas_io import *  # noqa: F401 F403
+from .pcap import *  # noqa: F401 F403
+from .python import *  # noqa: F401 F403
+from .rdbms import *  # noqa: F401 F403
+from .smart_sheet import *  # noqa: F401 F403
+from .sql_literal import *  # noqa: F401 F403
+from .sumo_logic import *  # noqa: F401 F403
+from .text_array import *  # noqa: F401 F403
+from .yaml import *  # noqa: F401 F403
+
+# TODO: Register adapters in a cleaner way (dynamic adapter loading?). Just get rid of the `import *`.
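The TODO at the end of that module points toward replacing the `import *` chain with dynamic discovery. As a rough illustration only (not part of the released package; the helper name `_load_all_adapters` is hypothetical), one standard-library approach would be to import every submodule of the package and let each one register itself through the existing @register_adapter decorator:

import importlib
import pkgutil

def _load_all_adapters() -> None:
    # Import every module under tableconv.adapters.df; importing alone is enough because each
    # adapter module registers its Adapter subclass at import time via @register_adapter.
    package = importlib.import_module("tableconv.adapters.df")
    for module_info in pkgutil.iter_modules(package.__path__):
        if module_info.name in {"base", "file_adapter_mixin"}:
            continue  # support modules, not adapters
        importlib.import_module(f"tableconv.adapters.df.{module_info.name}")

_load_all_adapters()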
tableconv/adapters/df/ascii.py
ADDED
@@ -0,0 +1,130 @@
+from tableconv.adapters.df.base import Adapter, register_adapter
+from tableconv.adapters.df.file_adapter_mixin import FileAdapterMixin
+
+
+def _render_value(value):
+    if value is None:
+        return ""
+    return str(value).replace("\n", "\\n")
+
+
+def _get_serialized_rows(rows):
+    return [{key: _render_value(value) for key, value in row.items()} for row in rows]
+
+
+def _get_column_max_lengths(rows, column_names):
+    return {column: max([len(row[column]) for row in rows] + [len(column)]) for column in column_names}
+
+
+def render_asciilite(ordered_fields, rows):
+    """Text table rendering inspired by the sqlite CLI."""
+    output_lines = []
+    for row in rows:
+        sorted_values = [row[field] for field in ordered_fields]
+        serialized_value = [_render_value(value) for value in sorted_values]
+        output_lines.append("|".join(serialized_value))
+    return "\n".join(output_lines)
+
+
+def render_unicodebox(ordered_fields, rows):
+    """Text table rendering inspired by ClickHouse."""
+    serialized_rows = _get_serialized_rows(rows)
+    max_lengths = _get_column_max_lengths(serialized_rows, ordered_fields)
+
+    output_lines = []
+    output_lines.append("┌─" + "─┬─".join([field.ljust(max_lengths[field], "─") for field in ordered_fields]) + "─┐")
+    for row in serialized_rows:
+        rendered_values_list = []
+        for field in ordered_fields:
+            rendered_values_list.append(row[field].ljust(max_lengths[field]))
+        output_lines.append("│ " + " │ ".join(rendered_values_list) + " │")
+    output_lines.append("└─" + "─┴─".join(["─" * max_lengths[field] for field in ordered_fields]) + "─┘")
+    return "\n".join(output_lines)
+
+
+@register_adapter(
+    [
+        "ascii",
+        "asciiplain",
+        "asciisimple",
+        "asciigrid",
+        "asciifancygrid",
+        "asciipipe",
+        "asciipresto",
+        "asciipretty",
+        "asciipsql",
+        "asciilite",
+        "asciibox",
+        "mediawikiformat",
+        "moinmoinformat",
+        "jiraformat",
+        "markdown",
+        "md",
+        "rst",
+        "html",
+        "latex",
+        "tex",
+    ],
+    write_only=True,
+)
+class ASCIIAdapter(FileAdapterMixin, Adapter):
+    """
+    Adapter focused on outputting ascii art style table renderings, such as those found in database CLIs.
+    """
+
+    text_based = True
+
+    # @staticmethod
+    # def _transform_df(df):
+    #     def transform(obj):
+    #         if isinstance(obj, datetime.datetime):
+    #             if obj.tzinfo is not None:
+    #                 obj = obj.astimezone(datetime.timezone.utc)
+    #             # Warning: Interpret naive TS as being UTC.
+    #             return obj.strftime('%Y-%m-%d %H:%M:%S')
+    #         elif isinstance(obj, list) or isinstance(obj, dict):
+    #             return str(obj)
+    #         return obj
+    #     df = df.applymap(transform)
+    #     df = df.replace({np.nan: None})
+
+    @staticmethod
+    def get_example_url(scheme):
+        return f"{scheme}:-"
+
+    @staticmethod
+    def dump_text_data(df, scheme, params):
+        TABULATE_TABLEFMT = {
+            "ascii": "simple",
+            "asciiplain": "plain",
+            "asciisimple": "simple",
+            "md": "github",
+            "markdown": "github",
+            "asciigrid": "grid",
+            "asciifancygrid": "fancy_grid",
+            "asciipipe": "pipe",
+            "asciipresto": "presto",
+            "asciipretty": "pretty",
+            "asciipsql": "psql",
+            "mediawikiformat": "mediawiki",
+            "moinmoinformat": "moinmoin",
+            "jiraformat": "jira",
+            "rst": "rst",
+            "latex": "latex",
+            "tex": "latex",
+        }
+        if scheme in TABULATE_TABLEFMT:
+            from tabulate import tabulate
+
+            return tabulate(
+                df.values.tolist(),
+                list(df.columns),
+                tablefmt=TABULATE_TABLEFMT[scheme],
+                disable_numparse=True,
+            )
+        elif scheme == "asciilite":
+            return render_asciilite(list(df.columns), df.to_dict("records"))
+        elif scheme == "asciibox":
+            return render_unicodebox(list(df.columns), df.to_dict("records"))
+        else:
+            raise AssertionError()
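For a sense of what the two hand-rolled renderers above produce, here is a small illustrative snippet; the sample rows are made up, and the expected output shown in the comments follows directly from the code:

rows = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]

print(render_asciilite(["id", "name"], rows))
# 1|Alice
# 2|Bob

print(render_unicodebox(["id", "name"], rows))
# ┌─id─┬─name──┐
# │ 1  │ Alice │
# │ 2  │ Bob   │
# └────┴───────┘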
tableconv/adapters/df/aws_athena.py
ADDED
@@ -0,0 +1,333 @@
+import contextlib
+import datetime
+import logging
+import os
+import tempfile
+import textwrap
+import time
+import uuid
+
+from tableconv.adapters.df.base import Adapter, register_adapter
+from tableconv.adapters.df.pandas_io import CSVAdapter, ParquetAdapter
+from tableconv.exceptions import (
+    AppendSchemeConflictError,
+    InvalidParamsError,
+    InvalidQueryError,
+    TableAlreadyExistsError,
+)
+from tableconv.uri import parse_uri
+
+logger = logging.getLogger(__name__)
+
+
+FORMAT_SQL_MAPPING = {
+    "parquet": textwrap.dedent(
+        """
+        ROW FORMAT SERDE
+        'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        STORED AS INPUTFORMAT
+        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        OUTPUTFORMAT
+        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+        """
+    ),
+    "csv": textwrap.dedent(
+        """
+        ROW FORMAT DELIMITED
+        FIELDS TERMINATED BY ','
+        STORED AS INPUTFORMAT
+        'org.apache.hadoop.mapred.TextInputFormat'
+        OUTPUTFORMAT
+        'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+        """
+    ),
+}
+
+
+@register_adapter(["awsathena"])
+class AWSAthenaAdapter(Adapter):
+    @staticmethod
+    def get_example_url(scheme):
+        return f"{scheme}://eu-central-1"
+
+    @staticmethod
+    def load(uri, query):
+        uri = parse_uri(uri)
+        aws_region = uri.authority
+
+        return AWSAthenaAdapter._run_athena_query(
+            query=query, aws_region=aws_region, catalog="AwsDataCatalog", database=None, return_results_df=True
+        )
+
+    @staticmethod
+    def _run_athena_query(
+        query, aws_region, catalog, database, return_results_raw=False, return_results_df=False, athena_client=None
+    ):
+        import boto3
+
+        if not athena_client:
+            athena_client = boto3.client("athena", region_name=aws_region)
+        sts = boto3.client("sts", region_name=aws_region)
+        s3 = boto3.client("s3", region_name=aws_region)
+
+        aws_account_id = sts.get_caller_identity()["Account"]
+        output_s3_bucket = f"aws-athena-query-results-{aws_account_id}-{aws_region}"
+
+        logger.debug(f"Querying.. aws_region={aws_region}, catalog={catalog}, output_s3_bucket={output_s3_bucket}.")
+        query_execution_context = {"Catalog": catalog}
+        if database:
+            query_execution_context["Database"] = database
+        query_req_resp = athena_client.start_query_execution(
+            QueryString=query,
+            QueryExecutionContext=query_execution_context,
+            WorkGroup="primary",
+            ResultConfiguration={"OutputLocation": f"s3://{output_s3_bucket}/"},
+        )
+        query_execution_id = query_req_resp["QueryExecutionId"]
+        logger.info(f"Waiting for AWS Athena query {query_execution_id}...")
+
+        while True:
+            details = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
+            status = details["QueryExecution"]["Status"]["State"]
+            if status in ("FAILED", "CANCELLED"):
+                error_message = details["QueryExecution"]["Status"].get("StateChangeReason")
+                raise InvalidQueryError(f"AWS Athena Query {status.lower()}: {error_message}")
+            elif status == "SUCCEEDED":
+                break
+            else:
+                POLL_INTERVAL = datetime.timedelta(seconds=2)
+                time.sleep(POLL_INTERVAL.total_seconds())
+
+        if return_results_raw:
+            response = athena_client.get_query_results(QueryExecutionId=query_execution_id)
+            return response["ResultSet"]
+        if return_results_df:
+            output_s3_key = f"{query_execution_id}.csv"
+            local_filename = f"/tmp/awsathena-{query_execution_id}.csv"
+            try:
+                s3.download_file(output_s3_bucket, output_s3_key, local_filename)
+                df = CSVAdapter.load(f"csv://{local_filename}", None)
+            finally:
+                with contextlib.suppress(FileNotFoundError):
+                    os.remove(local_filename)
+            return df
+
+    @staticmethod
+    def _get_json_schema(df):
+        """
+        TODO: Update aws_athena to not be a df-level adapter, and instead use IntermediateExchangeTable, so we don't
+        have to duplicate this logic.
+        """
+        from genson import SchemaBuilder
+
+        builder = SchemaBuilder()
+        builder.add_schema({"type": "object", "properties": {}})
+        for row in df.to_dict(orient="records"):
+            builder.add_object(row)
+        return builder.to_schema()
+
+    @staticmethod
+    def resolve_presto_type(json_schema, column_name=None, top_level=False):
+        presto_types = set()
+        if "type" in json_schema:
+            if isinstance(json_schema["type"], str):
+                json_types = {json_schema["type"]}
+            else:
+                assert isinstance(json_schema["type"], list)
+                json_types = json_schema["type"]
+            for json_type in json_types:
+                if json_type == "array":
+                    if "items" in json_schema:
+                        array_type = AWSAthenaAdapter.resolve_presto_type(json_schema["items"])
+                    else:
+                        array_type = "string"
+                    presto_types.add(f"array<{array_type}>")
+                elif json_type == "null":
+                    pass
+                else:
+                    presto_types.add(
+                        {
+                            "integer": "bigint",
+                            "string": "string",
+                            "boolean": "boolean",
+                            "number": "double",
+                            "object": "string",
+                        }[json_type]
+                    )
+        else:
+            assert "anyOf" in json_schema
+            for sub_definition in json_schema["anyOf"]:
+                presto_types.add(AWSAthenaAdapter.resolve_presto_type(sub_definition))
+        if "null" in presto_types and presto_types != {"null"}:
+            presto_types.remove("null")
+        if "double" in presto_types and presto_types != {"double"}:
+            # hide NaN corruption added by pandas (pandas converts nulls to NaN, which then cause the column to get
+            # misidentified (or so I argue) as containing doubles)
+            presto_types.remove("double")
+        if len(presto_types) > 1:
+            if top_level:
+                logger.warning(
+                    f"Identified multiple conflicting types for {column_name}: {presto_types}. Picking one "
+                    + "arbitrarily."
+                )
+            presto_types = {presto_types.pop()}
+        if len(presto_types) == 0:
+            if top_level:
+                logger.warning(f"Unable to identify type of column {column_name}. Picking string.")
+            presto_types = {"string"}
+        return presto_types.pop()
+
+    @staticmethod
+    def _gen_schema(df, data_format, table_name, s3_base_url):
+        schema = f"CREATE EXTERNAL TABLE `{table_name}` (\n"
+        field_schema_lines = []
+        columns = []
+        for column, json_schema in AWSAthenaAdapter._get_json_schema(df)["properties"].items():
+            presto_type = AWSAthenaAdapter.resolve_presto_type(json_schema, column_name=column, top_level=True)
+            field_schema_lines.append(f" `{column}` {presto_type}")
+            columns.append(column)
+        schema += ",\n".join(field_schema_lines)
+        schema += "\n)"
+        schema += FORMAT_SQL_MAPPING[data_format]
+        schema += f"LOCATION\n '{s3_base_url}'"
+        return schema, columns
+
+    @staticmethod
+    def dump(df, uri):
+        import boto3
+
+        uri = parse_uri(uri)
+        if "if_exists" in uri.query:
+            if_exists = uri.query["if_exists"]
+        elif "append" in uri.query and uri.query["append"].lower() != "false":
+            if_exists = "append"
+        elif "overwrite" in uri.query and uri.query["overwrite"].lower() != "false":
+            if_exists = "replace"
+        else:
+            if_exists = "fail"
+
+        if if_exists not in ("replace", "append", "fail"):
+            raise InvalidParamsError("valid values for if_exists are replace, append, or fail (default)")
+
+        aws_region = uri.authority
+
+        athena_client = boto3.client("athena", region_name=aws_region)
+        catalog = "AwsDataCatalog"
+        database = uri.path.strip("/")
+        table_name = uri.query["table"]
+
+        s3_bucket_path = uri.query["s3_bucket_path"]
+        s3_bucket_path_split = os.path.split(s3_bucket_path)
+        if s3_bucket_path_split[0]:
+            s3_bucket = s3_bucket_path_split[0]
+            s3_bucket_prefix = os.path.join(s3_bucket_path_split[1], table_name)
+        else:
+            s3_bucket = s3_bucket_path_split[1]
+            s3_bucket_prefix = table_name
+        data_format = uri.query["data_format"]
+        if data_format not in FORMAT_SQL_MAPPING:
+            raise InvalidParamsError(f"Only formats {FORMAT_SQL_MAPPING.keys()} supported")
+        s3_base_url = f"s3://{os.path.join(s3_bucket, s3_bucket_prefix)}"
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Dump to temp file on disk
+            filename = f"{uuid.uuid4()}.{data_format}"
+            temp_file_path = os.path.join(temp_dir, filename)
+            if data_format == "csv":
+                CSVAdapter.dump(df, uri=temp_file_path)
+            elif data_format == "parquet":
+                ParquetAdapter.dump(df, uri=temp_file_path)
+            else:
+                raise AssertionError
+
+            # Manage Table DDL
+            schema_ddl, columns = AWSAthenaAdapter._gen_schema(df, data_format, table_name, s3_base_url)
+
+            try:
+                table_metadata = athena_client.get_table_metadata(
+                    CatalogName=catalog, DatabaseName=database, TableName=table_name
+                )
+                table_exists = True
+            except athena_client.exceptions.MetadataException as exc:
+                if "EntityNotFoundException" in exc.response["Error"]["Message"]:
+                    table_exists = False
+                else:
+                    raise
+            if table_exists:
+                if if_exists == "fail":
+                    raise TableAlreadyExistsError(f"{database}.{table_name} already exists")
+                elif if_exists == "append":
+                    pre_existing_columns = [col["Name"] for col in table_metadata["TableMetadata"]["Columns"]]
+                    if not pre_existing_columns == columns:
+                        raise AppendSchemeConflictError("Cannot append to existing table - schema mismatch")
+                    pre_existing_s3_base_url = table_metadata["TableMetadata"]["Parameters"]["location"].strip("/")
+                    if pre_existing_s3_base_url != s3_base_url.strip("/"):
+                        existing_uri = parse_uri(pre_existing_s3_base_url)
+                        existing_bucket = existing_uri.authority
+                        existing_prefix = existing_uri.path.strip("/")
+                        if existing_bucket != s3_bucket:
+                            raise AppendSchemeConflictError(
+                                "Cannot append to existing table - s3 bucket mismatch "
+                                + f"(pre-existing location is {pre_existing_s3_base_url})"
+                            )
+                        if existing_prefix.startswith(s3_bucket_prefix):
+                            # Discovered prefix is more restrictive than our requested prefix - this is safe, we can
+                            # just adopt it.
+                            logger.warning(f"Appending to found pre-existing prefix at {existing_prefix}")
+                            s3_bucket_prefix = existing_prefix
+                            s3_base_url = f"s3://{os.path.join(s3_bucket, s3_bucket_prefix)}"
+                            schema_ddl = None  # Invalidate the now-outdated schema
+                elif if_exists == "replace":
+                    s3_bucket_prefix = os.path.join(s3_bucket_prefix, str(uuid.uuid4()))
+                    s3_base_url = f"s3://{os.path.join(s3_bucket, s3_bucket_prefix)}"
+                    schema_ddl, _ = AWSAthenaAdapter._gen_schema(df, data_format, table_name, s3_base_url)
+                    logger.warning(
+                        f"Deleting table definition for {database}.{table_name}. Leaving old data behind and changing "
+                        + f"prefix to {s3_bucket_prefix}/."
+                    )
+                    assert s3_base_url and schema_ddl and s3_bucket_prefix  # safety check
+                    old_table_schema_query_result = AWSAthenaAdapter._run_athena_query(
+                        query=f"SHOW CREATE TABLE `{table_name}`",
+                        return_results_raw=True,
+                        aws_region=aws_region,
+                        catalog="AwsDataCatalog",
+                        database=database,
+                        athena_client=athena_client,
+                    )
+                    old_table_schema = "\n".join(
+                        [x["Data"][0]["VarCharValue"] for x in old_table_schema_query_result["Rows"]]
+                    )
+                    logger.debug(
+                        "Backup of old table schema before deleting:\n" + textwrap.indent(old_table_schema, " ")
+                    )
+                    AWSAthenaAdapter._run_athena_query(
+                        query=f"DROP TABLE `{table_name}`",
+                        aws_region=aws_region,
+                        catalog="AwsDataCatalog",
+                        database=database,
+                        athena_client=athena_client,
+                    )
+                    # It's just too dangerous to actually delete any data; commented out.
+                    # s3 = boto3.resource('s3')
+                    # bucket = s3.Bucket(s3_bucket)
+                    # bucket.objects.filter(Prefix=f'{s3_bucket_prefix}/').delete()
+                    table_exists = False
+                else:
+                    raise AssertionError
+            if not table_exists:
+                logger.info(f"Creating new table {database}.{table_name} in {aws_region}")
+                logger.debug("\n" + textwrap.indent(schema_ddl, " "))
+                AWSAthenaAdapter._run_athena_query(
+                    query=schema_ddl,
+                    return_results_raw=True,
+                    aws_region=aws_region,
+                    catalog="AwsDataCatalog",
+                    database=database,
+                    athena_client=athena_client,
+                )
+
+            # Upload temp file to s3
+            s3_client = boto3.client("s3")
+            s3_object_key = os.path.join(s3_bucket_prefix, filename)
+            logger.info(f"Uploading data to s3://{os.path.join(s3_bucket, s3_object_key)}")
+            s3_client.upload_file(temp_file_path, s3_bucket, s3_object_key)
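To make the JSON Schema to Athena/Presto type mapping above concrete, here are a few illustrative calls to resolve_presto_type with hand-written schema fragments (the inputs are examples of mine, not from the package; the results follow from the mapping table and the null/double pruning rules in the code):

AWSAthenaAdapter.resolve_presto_type({"type": "integer"})
# -> "bigint"
AWSAthenaAdapter.resolve_presto_type({"type": ["string", "null"]})
# -> "string"  (the "null" entry is simply ignored)
AWSAthenaAdapter.resolve_presto_type({"type": "array", "items": {"type": "number"}})
# -> "array<double>"
AWSAthenaAdapter.resolve_presto_type({"type": ["integer", "number"]})
# -> "bigint"  ("double" is dropped whenever another type is present, to hide pandas NaN noise)

For dump(), the code reads the AWS region from the URL authority, the database from the path, and table, s3_bucket_path, data_format, and optionally if_exists from the query string, so a destination URL looks roughly like awsathena://eu-central-1/my_database?table=my_table&s3_bucket_path=my-bucket/exports&data_format=parquet (all names here are hypothetical).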
tableconv/adapters/df/aws_dynamodb.py
ADDED
@@ -0,0 +1,37 @@
+import logging
+
+import pandas as pd
+
+from tableconv.adapters.df.base import Adapter, register_adapter
+from tableconv.uri import parse_uri
+
+logger = logging.getLogger(__name__)
+
+
+@register_adapter(["awsdynamodb"], read_only=True)
+class AWSDynamoDB(Adapter):
+    @staticmethod
+    def get_example_url(scheme):
+        return f"{scheme}://eu-central-1/example_table"
+
+    @staticmethod
+    def load(uri, query):
+        import boto3
+
+        uri = parse_uri(uri)
+        aws_region = uri.authority
+        table_name = uri.path.strip("/")
+
+        dynamodb = boto3.client("dynamodb", region_name=aws_region)
+
+        if query:
+            result = dynamodb.execute_statement(Statement=query)
+            raw_array = result["Items"]
+        else:
+            logger.info("Sequentially querying DynamoDB scan results...")
+            scan_results = dynamodb.get_paginator("scan").paginate(TableName=table_name)
+            raw_array = []
+            for response in scan_results:
+                raw_array.extend(response["Items"])
+
+        return pd.DataFrame.from_records(raw_array)
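Because the low-level boto3 DynamoDB client is used here, both the PartiQL path (execute_statement) and the scan path return items in DynamoDB's attribute-value encoding (e.g. {"id": {"S": "123"}}), and that is what ends up in the DataFrame columns. A hypothetical invocation (region, table, and query are made up):

df = AWSDynamoDB.load(
    "awsdynamodb://eu-central-1/example_table",
    'SELECT * FROM "example_table"',  # PartiQL, passed through to execute_statement
)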
tableconv/adapters/df/aws_logs.py
ADDED
@@ -0,0 +1,68 @@
+import contextlib
+import datetime
+import logging
+import time
+
+import pandas as pd
+
+from tableconv.adapters.df.base import Adapter, register_adapter
+from tableconv.exceptions import InvalidQueryError
+from tableconv.parse_time import parse_input_time
+from tableconv.uri import parse_uri
+
+logger = logging.getLogger(__name__)
+
+
+@register_adapter(["awslogs"], read_only=True)
+class AWSLogs(Adapter):
+    """AWS CloudWatch Logs (Disclaimer: Only supports Logs Insights queries for now)"""
+
+    @staticmethod
+    def get_example_url(scheme):
+        return f"{scheme}://eu-central-1//aws/lambda/example-function"
+
+    @staticmethod
+    def load(uri, query):
+        import boto3
+
+        uri = parse_uri(uri)
+        aws_region = uri.authority
+
+        from_time = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=1)
+        to_time = datetime.datetime.now(tz=datetime.timezone.utc)
+        if "from" in uri.query:
+            from_time = parse_input_time(uri.query["from"])
+        if "to" in uri.query:
+            to_time = parse_input_time(uri.query["to"])
+        client = boto3.client("logs", region_name=aws_region)
+
+        path = uri.path
+        if path[0] == "/":
+            path = path[1:]
+
+        query_id = client.start_query(
+            logGroupName=path,
+            startTime=int(from_time.timestamp()),
+            endTime=int(to_time.timestamp()),
+            queryString=query,
+            limit=int(uri.query.get("limit", 1000)),
+        )["queryId"]
+
+        try:
+            while True:
+                results = client.get_query_results(queryId=query_id)
+                if results["status"] in ("Failed", "Timeout", "Unknown"):
+                    raise InvalidQueryError(f'AWS CloudWatch Logs Insights Query {results["status"]}.')
+                elif results["status"] == "Complete":
+                    raw_array = [{item["field"]: item["value"] for item in row} for row in results["results"]]
+                    break
+                else:
+                    assert results["status"] in ("Running", "Scheduled")
+                    POLL_INTERVAL = datetime.timedelta(seconds=2)
+                    time.sleep(POLL_INTERVAL.total_seconds())
+        except Exception as exc:
+            with contextlib.suppress(Exception):
+                client.stop_query(queryId=query_id)
+            raise exc
+
+        return pd.DataFrame.from_records(raw_array)
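A hypothetical invocation of the adapter above (region and log group are made up); the second argument is a CloudWatch Logs Insights query, which load() passes straight through to start_query():

df = AWSLogs.load(
    "awslogs://eu-central-1//aws/lambda/example-function?limit=500",
    "fields @timestamp, @message | sort @timestamp desc",
)
# "from" and "to" can also be supplied in the query string; they are parsed by tableconv's
# parse_input_time helper, whose accepted formats are defined in tableconv/parse_time.py.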