tableconv-1.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. tableconv/__init__.py +20 -0
  2. tableconv/__version__.py +3 -0
  3. tableconv/adapters/__init__.py +0 -0
  4. tableconv/adapters/df/__init__.py +25 -0
  5. tableconv/adapters/df/ascii.py +130 -0
  6. tableconv/adapters/df/aws_athena.py +333 -0
  7. tableconv/adapters/df/aws_dynamodb.py +37 -0
  8. tableconv/adapters/df/aws_logs.py +68 -0
  9. tableconv/adapters/df/base.py +65 -0
  10. tableconv/adapters/df/example.py +14 -0
  11. tableconv/adapters/df/file_adapter_mixin.py +69 -0
  12. tableconv/adapters/df/gsheets.py +438 -0
  13. tableconv/adapters/df/jc.py +57 -0
  14. tableconv/adapters/df/jira.py +13 -0
  15. tableconv/adapters/df/json.py +215 -0
  16. tableconv/adapters/df/leveldb.py +20 -0
  17. tableconv/adapters/df/nested_list.py +95 -0
  18. tableconv/adapters/df/numbers.py +32 -0
  19. tableconv/adapters/df/osquery.py +43 -0
  20. tableconv/adapters/df/pandas_io.py +238 -0
  21. tableconv/adapters/df/pcap.py +73 -0
  22. tableconv/adapters/df/python.py +39 -0
  23. tableconv/adapters/df/rdbms.py +152 -0
  24. tableconv/adapters/df/smart_sheet.py +72 -0
  25. tableconv/adapters/df/sql_literal.py +24 -0
  26. tableconv/adapters/df/sumo_logic.py +195 -0
  27. tableconv/adapters/df/text_array.py +146 -0
  28. tableconv/adapters/df/yaml.py +43 -0
  29. tableconv/core.py +427 -0
  30. tableconv/exceptions.py +118 -0
  31. tableconv/in_memory_query.py +93 -0
  32. tableconv/interactive.py +182 -0
  33. tableconv/main.py +358 -0
  34. tableconv/parse_time.py +21 -0
  35. tableconv/uri.py +47 -0
  36. tableconv-1.8.dist-info/METADATA +297 -0
  37. tableconv-1.8.dist-info/RECORD +42 -0
  38. tableconv-1.8.dist-info/WHEEL +4 -0
  39. tableconv-1.8.dist-info/entry_points.txt +2 -0
  40. tableconv-1.8.dist-info/licenses/LICENSE +7 -0
  41. tableconv_daemon/__init__.py +0 -0
  42. tableconv_daemon/main.py +247 -0
tableconv/__init__.py ADDED
@@ -0,0 +1,20 @@
+ import logging
+
+ # One gripe with Python is that some libraries immediately start logging unhelpful stuff the moment you import them.
+ # Suppressing that triggers flake8 E402 (code-before-imports error). How are you supposed to resolve this cleanly?
+ logging.getLogger("numexpr").setLevel(logging.ERROR)
+
+ from .__version__ import __version__  # noqa: E402
+ from .core import IntermediateExchangeTable, load_url  # noqa: E402
+ from .exceptions import DataError, EmptyDataError, InvalidQueryError, InvalidURLError, SuppliedDataError  # noqa: E402
+
+ __all__ = [
+     "IntermediateExchangeTable",
+     "load_url",
+     "EmptyDataError",
+     "DataError",
+     "InvalidQueryError",
+     "InvalidURLError",
+     "SuppliedDataError",
+     "__version__",
+ ]
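For orientation, a minimal sketch of consuming this public API. Only names exported above are used; the example URI is illustrative, and the actual signatures live in tableconv/core.py, which is not excerpted here.

    import tableconv

    try:
        # Presumably returns an IntermediateExchangeTable (see tableconv/core.py).
        table = tableconv.load_url("csv:///tmp/example.csv")
    except tableconv.InvalidURLError:
        pass  # e.g. unrecognized URI scheme
    except tableconv.EmptyDataError:
        pass  # e.g. the source parsed cleanly but contained no rows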
tableconv/__version__.py ADDED
@@ -0,0 +1,3 @@
+ VERSION = (1, 9985, 20250218)
+
+ __version__ = ".".join(map(str, VERSION))
tableconv/adapters/__init__.py ADDED
File without changes (empty file)
tableconv/adapters/df/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from tableconv.adapters.df.base import adapters, read_adapters, write_adapters  # noqa: F401
+
+ from .ascii import *  # noqa: F401 F403
+ from .aws_athena import *  # noqa: F401 F403
+ from .aws_dynamodb import *  # noqa: F401 F403
+ from .aws_logs import *  # noqa: F401 F403
+ from .gsheets import *  # noqa: F401 F403
+ from .jc import *  # noqa: F401 F403
+ from .jira import *  # noqa: F401 F403
+ from .json import *  # noqa: F401 F403
+ from .leveldb import *  # noqa: F401 F403
+ from .nested_list import *  # noqa: F401 F403
+ from .numbers import *  # noqa: F401 F403
+ from .osquery import *  # noqa: F401 F403
+ from .pandas_io import *  # noqa: F401 F403
+ from .pcap import *  # noqa: F401 F403
+ from .python import *  # noqa: F401 F403
+ from .rdbms import *  # noqa: F401 F403
+ from .smart_sheet import *  # noqa: F401 F403
+ from .sql_literal import *  # noqa: F401 F403
+ from .sumo_logic import *  # noqa: F401 F403
+ from .text_array import *  # noqa: F401 F403
+ from .yaml import *  # noqa: F401 F403
+
+ # TODO: Register adapters in a cleaner way (dynamic adapter loading?). Just get rid of the `import *`.
tableconv/adapters/df/ascii.py ADDED
@@ -0,0 +1,130 @@
+ from tableconv.adapters.df.base import Adapter, register_adapter
+ from tableconv.adapters.df.file_adapter_mixin import FileAdapterMixin
+
+
+ def _render_value(value):
+     if value is None:
+         return ""
+     return str(value).replace("\n", "\\n")
+
+
+ def _get_serialized_rows(rows):
+     return [{key: _render_value(value) for key, value in row.items()} for row in rows]
+
+
+ def _get_column_max_lengths(rows, column_names):
+     return {column: max([len(row[column]) for row in rows] + [len(column)]) for column in column_names}
+
+
+ def render_asciilite(ordered_fields, rows):
+     """Text table rendering inspired by the sqlite CLI."""
+     output_lines = []
+     for row in rows:
+         sorted_values = [row[field] for field in ordered_fields]
+         serialized_values = [_render_value(value) for value in sorted_values]
+         output_lines.append("|".join(serialized_values))
+     return "\n".join(output_lines)
+
+
+ def render_unicodebox(ordered_fields, rows):
+     """Text table rendering inspired by ClickHouse."""
+     serialized_rows = _get_serialized_rows(rows)
+     max_lengths = _get_column_max_lengths(serialized_rows, ordered_fields)
+
+     output_lines = []
+     output_lines.append("┌─" + "─┬─".join([field.ljust(max_lengths[field], "─") for field in ordered_fields]) + "─┐")
+     for row in serialized_rows:
+         rendered_values_list = []
+         for field in ordered_fields:
+             rendered_values_list.append(row[field].ljust(max_lengths[field]))
+         output_lines.append("│ " + " │ ".join(rendered_values_list) + " │")
+     output_lines.append("└─" + "─┴─".join(["─" * max_lengths[field] for field in ordered_fields]) + "─┘")
+     return "\n".join(output_lines)
+
+
+ @register_adapter(
+     [
+         "ascii",
+         "asciiplain",
+         "asciisimple",
+         "asciigrid",
+         "asciifancygrid",
+         "asciipipe",
+         "asciipresto",
+         "asciipretty",
+         "asciipsql",
+         "asciilite",
+         "asciibox",
+         "mediawikiformat",
+         "moinmoinformat",
+         "jiraformat",
+         "markdown",
+         "md",
+         "rst",
+         "html",
+         "latex",
+         "tex",
+     ],
+     write_only=True,
+ )
+ class ASCIIAdapter(FileAdapterMixin, Adapter):
+     """
+     Adapter focused on outputting ascii art style table renderings, such as those found in database CLIs.
+     """
+
+     text_based = True
+
+     # @staticmethod
+     # def _transform_df(df):
+     #     def transform(obj):
+     #         if isinstance(obj, datetime.datetime):
+     #             if obj.tzinfo is not None:
+     #                 obj = obj.astimezone(datetime.timezone.utc)
+     #             # Warning: Interpret naive TS as being UTC.
+     #             return obj.strftime('%Y-%m-%d %H:%M:%S')
+     #         elif isinstance(obj, list) or isinstance(obj, dict):
+     #             return str(obj)
+     #         return obj
+     #     df = df.applymap(transform)
+     #     df = df.replace({np.nan: None})
+
+     @staticmethod
+     def get_example_url(scheme):
+         return f"{scheme}:-"
+
+     @staticmethod
+     def dump_text_data(df, scheme, params):
+         TABULATE_TABLEFMT = {
+             "ascii": "simple",
+             "asciiplain": "plain",
+             "asciisimple": "simple",
+             "md": "github",
+             "markdown": "github",
+             "asciigrid": "grid",
+             "asciifancygrid": "fancy_grid",
+             "asciipipe": "pipe",
+             "asciipresto": "presto",
+             "asciipretty": "pretty",
+             "asciipsql": "psql",
+             "mediawikiformat": "mediawiki",
+             "moinmoinformat": "moinmoin",
+             "jiraformat": "jira",
+             "rst": "rst",
+             "latex": "latex",
+             "tex": "latex",
+         }
+         if scheme in TABULATE_TABLEFMT:
+             from tabulate import tabulate
+
+             return tabulate(
+                 df.values.tolist(),
+                 list(df.columns),
+                 tablefmt=TABULATE_TABLEFMT[scheme],
+                 disable_numparse=True,
+             )
+         elif scheme == "asciilite":
+             return render_asciilite(list(df.columns), df.to_dict("records"))
+         elif scheme == "asciibox":
+             return render_unicodebox(list(df.columns), df.to_dict("records"))
+         else:
+             raise AssertionError()
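To make the two hand-rolled renderers concrete, here is what they produce for a small invented table (output traced by hand from the functions above; None renders as an empty string):

    rows = [{"id": 1, "name": "ada"}, {"id": 2, "name": None}]
    fields = ["id", "name"]

    print(render_asciilite(fields, rows))
    # 1|ada
    # 2|

    print(render_unicodebox(fields, rows))
    # ┌─id─┬─name─┐
    # │ 1  │ ada  │
    # │ 2  │      │
    # └────┴──────┘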
tableconv/adapters/df/aws_athena.py ADDED
@@ -0,0 +1,333 @@
+ import contextlib
+ import datetime
+ import logging
+ import os
+ import tempfile
+ import textwrap
+ import time
+ import uuid
+
+ from tableconv.adapters.df.base import Adapter, register_adapter
+ from tableconv.adapters.df.pandas_io import CSVAdapter, ParquetAdapter
+ from tableconv.exceptions import (
+     AppendSchemeConflictError,
+     InvalidParamsError,
+     InvalidQueryError,
+     TableAlreadyExistsError,
+ )
+ from tableconv.uri import parse_uri
+
+ logger = logging.getLogger(__name__)
+
+
+ FORMAT_SQL_MAPPING = {
+     "parquet": textwrap.dedent(
+         """
+         ROW FORMAT SERDE
+           'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+         STORED AS INPUTFORMAT
+           'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+         OUTPUTFORMAT
+           'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+         """
+     ),
+     "csv": textwrap.dedent(
+         """
+         ROW FORMAT DELIMITED
+           FIELDS TERMINATED BY ','
+         STORED AS INPUTFORMAT
+           'org.apache.hadoop.mapred.TextInputFormat'
+         OUTPUTFORMAT
+           'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+         """
+     ),
+ }
+
+
+ @register_adapter(["awsathena"])
+ class AWSAthenaAdapter(Adapter):
+     @staticmethod
+     def get_example_url(scheme):
+         return f"{scheme}://eu-central-1"
+
+     @staticmethod
+     def load(uri, query):
+         uri = parse_uri(uri)
+         aws_region = uri.authority
+
+         return AWSAthenaAdapter._run_athena_query(
+             query=query, aws_region=aws_region, catalog="AwsDataCatalog", database=None, return_results_df=True
+         )
+
+     @staticmethod
+     def _run_athena_query(
+         query, aws_region, catalog, database, return_results_raw=False, return_results_df=False, athena_client=None
+     ):
+         import boto3
+
+         if not athena_client:
+             athena_client = boto3.client("athena", region_name=aws_region)
+         sts = boto3.client("sts", region_name=aws_region)
+         s3 = boto3.client("s3", region_name=aws_region)
+
+         aws_account_id = sts.get_caller_identity()["Account"]
+         output_s3_bucket = f"aws-athena-query-results-{aws_account_id}-{aws_region}"
+
+         logger.debug(f"Querying... aws_region={aws_region}, catalog={catalog}, output_s3_bucket={output_s3_bucket}.")
+         query_execution_context = {"Catalog": catalog}
+         if database:
+             query_execution_context["Database"] = database
+         query_req_resp = athena_client.start_query_execution(
+             QueryString=query,
+             QueryExecutionContext=query_execution_context,
+             WorkGroup="primary",
+             ResultConfiguration={"OutputLocation": f"s3://{output_s3_bucket}/"},
+         )
+         query_execution_id = query_req_resp["QueryExecutionId"]
+         logger.info(f"Waiting for AWS Athena query {query_execution_id}...")
+
+         while True:
+             details = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
+             status = details["QueryExecution"]["Status"]["State"]
+             if status in ("FAILED", "CANCELLED"):
+                 error_message = details["QueryExecution"]["Status"].get("StateChangeReason")
+                 raise InvalidQueryError(f"AWS Athena Query {status.lower()}: {error_message}")
+             elif status == "SUCCEEDED":
+                 break
+             else:
+                 POLL_INTERVAL = datetime.timedelta(seconds=2)
+                 time.sleep(POLL_INTERVAL.total_seconds())
+
+         if return_results_raw:
+             response = athena_client.get_query_results(QueryExecutionId=query_execution_id)
+             return response["ResultSet"]
+         if return_results_df:
+             output_s3_key = f"{query_execution_id}.csv"
+             local_filename = f"/tmp/awsathena-{query_execution_id}.csv"
+             try:
+                 s3.download_file(output_s3_bucket, output_s3_key, local_filename)
+                 df = CSVAdapter.load(f"csv://{local_filename}", None)
+             finally:
+                 with contextlib.suppress(FileNotFoundError):
+                     os.remove(local_filename)
+             return df
+
+     @staticmethod
+     def _get_json_schema(df):
+         """
+         TODO: Update aws_athena to not be a df-level adapter, and instead use IntermediateExchangeTable, so we don't
+         have to duplicate this logic.
+         """
+         from genson import SchemaBuilder
+
+         builder = SchemaBuilder()
+         builder.add_schema({"type": "object", "properties": {}})
+         for row in df.to_dict(orient="records"):
+             builder.add_object(row)
+         return builder.to_schema()
+
+     @staticmethod
+     def resolve_presto_type(json_schema, column_name=None, top_level=False):
+         presto_types = set()
+         if "type" in json_schema:
+             if isinstance(json_schema["type"], str):
+                 json_types = {json_schema["type"]}
+             else:
+                 assert isinstance(json_schema["type"], list)
+                 json_types = json_schema["type"]
+             for json_type in json_types:
+                 if json_type == "array":
+                     if "items" in json_schema:
+                         array_type = AWSAthenaAdapter.resolve_presto_type(json_schema["items"])
+                     else:
+                         array_type = "string"
+                     presto_types.add(f"array<{array_type}>")
+                 elif json_type == "null":
+                     pass
+                 else:
+                     presto_types.add(
+                         {
+                             "integer": "bigint",
+                             "string": "string",
+                             "boolean": "boolean",
+                             "number": "double",
+                             "object": "string",
+                         }[json_type]
+                     )
+         else:
+             assert "anyOf" in json_schema
+             for sub_definition in json_schema["anyOf"]:
+                 presto_types.add(AWSAthenaAdapter.resolve_presto_type(sub_definition))
+         if "null" in presto_types and presto_types != {"null"}:
+             presto_types.remove("null")
+         if "double" in presto_types and presto_types != {"double"}:
+             # hide NaN corruption added by pandas (pandas converts nulls to NaN, which then cause the column to get
+             # misidentified (or so I argue) as containing doubles)
+             presto_types.remove("double")
+         if len(presto_types) > 1:
+             if top_level:
+                 logger.warning(
+                     f"Identified multiple conflicting types for {column_name}: {presto_types}. Picking one "
+                     + "arbitrarily."
+                 )
+             presto_types = {presto_types.pop()}
+         if len(presto_types) == 0:
+             if top_level:
+                 logger.warning(f"Unable to identify type of column {column_name}. Picking string.")
+             presto_types = {"string"}
+         return presto_types.pop()
+
+     @staticmethod
+     def _gen_schema(df, data_format, table_name, s3_base_url):
+         schema = f"CREATE EXTERNAL TABLE `{table_name}` (\n"
+         field_schema_lines = []
+         columns = []
+         for column, json_schema in AWSAthenaAdapter._get_json_schema(df)["properties"].items():
+             presto_type = AWSAthenaAdapter.resolve_presto_type(json_schema, column_name=column, top_level=True)
+             field_schema_lines.append(f"  `{column}` {presto_type}")
+             columns.append(column)
+         schema += ",\n".join(field_schema_lines)
+         schema += "\n)"
+         schema += FORMAT_SQL_MAPPING[data_format]
+         schema += f"LOCATION\n  '{s3_base_url}'"
+         return schema, columns
+
+     @staticmethod
+     def dump(df, uri):
+         import boto3
+
+         uri = parse_uri(uri)
+         if "if_exists" in uri.query:
+             if_exists = uri.query["if_exists"]
+         elif "append" in uri.query and uri.query["append"].lower() != "false":
+             if_exists = "append"
+         elif "overwrite" in uri.query and uri.query["overwrite"].lower() != "false":
+             if_exists = "replace"
+         else:
+             if_exists = "fail"
+
+         if if_exists not in ("replace", "append", "fail"):
+             raise InvalidParamsError("valid values for if_exists are replace, append, or fail (default)")
+
+         aws_region = uri.authority
+
+         athena_client = boto3.client("athena", region_name=aws_region)
+         catalog = "AwsDataCatalog"
+         database = uri.path.strip("/")
+         table_name = uri.query["table"]
+
+         s3_bucket_path = uri.query["s3_bucket_path"]
+         s3_bucket_path_split = os.path.split(s3_bucket_path)
+         if s3_bucket_path_split[0]:
+             s3_bucket = s3_bucket_path_split[0]
+             s3_bucket_prefix = os.path.join(s3_bucket_path_split[1], table_name)
+         else:
+             s3_bucket = s3_bucket_path_split[1]
+             s3_bucket_prefix = table_name
+         data_format = uri.query["data_format"]
+         if data_format not in FORMAT_SQL_MAPPING:
+             raise InvalidParamsError(f"Only formats {FORMAT_SQL_MAPPING.keys()} supported")
+         s3_base_url = f"s3://{os.path.join(s3_bucket, s3_bucket_prefix)}"
+
+         with tempfile.TemporaryDirectory() as temp_dir:
+             # Dump to temp file on disk
+             filename = f"{uuid.uuid4()}.{data_format}"
+             temp_file_path = os.path.join(temp_dir, filename)
+             if data_format == "csv":
+                 CSVAdapter.dump(df, uri=temp_file_path)
+             elif data_format == "parquet":
+                 ParquetAdapter.dump(df, uri=temp_file_path)
+             else:
+                 raise AssertionError
+
+             # Manage Table DDL
+             schema_ddl, columns = AWSAthenaAdapter._gen_schema(df, data_format, table_name, s3_base_url)
+
+             try:
+                 table_metadata = athena_client.get_table_metadata(
+                     CatalogName=catalog, DatabaseName=database, TableName=table_name
+                 )
+                 table_exists = True
+             except athena_client.exceptions.MetadataException as exc:
+                 if "EntityNotFoundException" in exc.response["Error"]["Message"]:
+                     table_exists = False
+                 else:
+                     raise
+             if table_exists:
+                 if if_exists == "fail":
+                     raise TableAlreadyExistsError(f"{database}.{table_name} already exists")
+                 elif if_exists == "append":
+                     pre_existing_columns = [col["Name"] for col in table_metadata["TableMetadata"]["Columns"]]
+                     if pre_existing_columns != columns:
+                         raise AppendSchemeConflictError("Cannot append to existing table - schema mismatch")
+                     pre_existing_s3_base_url = table_metadata["TableMetadata"]["Parameters"]["location"].strip("/")
+                     if pre_existing_s3_base_url != s3_base_url.strip("/"):
+                         existing_uri = parse_uri(pre_existing_s3_base_url)
+                         existing_bucket = existing_uri.authority
+                         existing_prefix = existing_uri.path.strip("/")
+                         if existing_bucket != s3_bucket:
+                             raise AppendSchemeConflictError(
+                                 "Cannot append to existing table - s3 bucket mismatch "
+                                 + f"(pre-existing location is {pre_existing_s3_base_url})"
+                             )
+                         if existing_prefix.startswith(s3_bucket_prefix):
+                             # Discovered prefix is more restrictive than our requested prefix - this is safe, we can
+                             # just adopt it.
+                             logger.warning(f"Appending to found pre-existing prefix at {existing_prefix}")
+                             s3_bucket_prefix = existing_prefix
+                             s3_base_url = f"s3://{os.path.join(s3_bucket, s3_bucket_prefix)}"
+                             schema_ddl = None  # Invalidate the now-outdated schema
+                 elif if_exists == "replace":
+                     s3_bucket_prefix = os.path.join(s3_bucket_prefix, str(uuid.uuid4()))
+                     s3_base_url = f"s3://{os.path.join(s3_bucket, s3_bucket_prefix)}"
+                     schema_ddl, _ = AWSAthenaAdapter._gen_schema(df, data_format, table_name, s3_base_url)
+                     logger.warning(
+                         f"Deleting table definition for {database}.{table_name}. Leaving old data behind and changing "
+                         + f"prefix to {s3_bucket_prefix}/."
+                     )
+                     assert s3_base_url and schema_ddl and s3_bucket_prefix  # safety check
+                     old_table_schema_query_result = AWSAthenaAdapter._run_athena_query(
+                         query=f"SHOW CREATE TABLE `{table_name}`",
+                         return_results_raw=True,
+                         aws_region=aws_region,
+                         catalog="AwsDataCatalog",
+                         database=database,
+                         athena_client=athena_client,
+                     )
+                     old_table_schema = "\n".join(
+                         [x["Data"][0]["VarCharValue"] for x in old_table_schema_query_result["Rows"]]
+                     )
+                     logger.debug(
+                         "Backup of old table schema before deleting:\n" + textwrap.indent(old_table_schema, "    ")
+                     )
+                     AWSAthenaAdapter._run_athena_query(
+                         query=f"DROP TABLE `{table_name}`",
+                         aws_region=aws_region,
+                         catalog="AwsDataCatalog",
+                         database=database,
+                         athena_client=athena_client,
+                     )
+                     # It's just too dangerous to actually delete any data; commented out.
+                     # s3 = boto3.resource('s3')
+                     # bucket = s3.Bucket(s3_bucket)
+                     # bucket.objects.filter(Prefix=f'{s3_bucket_prefix}/').delete()
+                     table_exists = False
+                 else:
+                     raise AssertionError
+             if not table_exists:
+                 logger.info(f"Creating new table {database}.{table_name} in {aws_region}")
+                 logger.debug("\n" + textwrap.indent(schema_ddl, "    "))
+                 AWSAthenaAdapter._run_athena_query(
+                     query=schema_ddl,
+                     return_results_raw=True,
+                     aws_region=aws_region,
+                     catalog="AwsDataCatalog",
+                     database=database,
+                     athena_client=athena_client,
+                 )
+
+             # Upload temp file to s3
+             s3_client = boto3.client("s3")
+             s3_object_key = os.path.join(s3_bucket_prefix, filename)
+             logger.info(f"Uploading data to s3://{os.path.join(s3_bucket, s3_object_key)}")
+             s3_client.upload_file(temp_file_path, s3_bucket, s3_object_key)
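The DDL generation above hinges on resolve_presto_type, which collapses a genson-style JSON schema into a single Athena/Presto type. A few worked invocations, traced by hand from the code (plain dicts stand in for genson output):

    # Nullable integer column: "null" is discarded, leaving bigint.
    AWSAthenaAdapter.resolve_presto_type({"type": ["integer", "null"]})  # -> "bigint"

    # Array of strings recurses into "items".
    AWSAthenaAdapter.resolve_presto_type({"type": "array", "items": {"type": "string"}})  # -> "array<string>"

    # Genuinely conflicting types: one is picked arbitrarily, with a warning when top_level=True.
    AWSAthenaAdapter.resolve_presto_type({"type": ["integer", "string"]}, column_name="x", top_level=True)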
tableconv/adapters/df/aws_dynamodb.py ADDED
@@ -0,0 +1,37 @@
+ import logging
+
+ import pandas as pd
+
+ from tableconv.adapters.df.base import Adapter, register_adapter
+ from tableconv.uri import parse_uri
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_adapter(["awsdynamodb"], read_only=True)
+ class AWSDynamoDB(Adapter):
+     @staticmethod
+     def get_example_url(scheme):
+         return f"{scheme}://eu-central-1/example_table"
+
+     @staticmethod
+     def load(uri, query):
+         import boto3
+
+         uri = parse_uri(uri)
+         aws_region = uri.authority
+         table_name = uri.path.strip("/")
+
+         dynamodb = boto3.client("dynamodb", region_name=aws_region)
+
+         if query:
+             result = dynamodb.execute_statement(Statement=query)
+             raw_array = result["Items"]
+         else:
+             logger.info("Sequentially querying DynamoDB scan results...")
+             scan_results = dynamodb.get_paginator("scan").paginate(TableName=table_name)
+             raw_array = []
+             for response in scan_results:
+                 raw_array.extend(response["Items"])
+
+         return pd.DataFrame.from_records(raw_array)
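A usage sketch, invoking the adapter directly (assumes AWS credentials in the environment and an existing table; the PartiQL statement is illustrative). Two properties are visible in the code above: the PartiQL path keeps only the first page of execute_statement results (no NextToken handling), and because the low-level client is used without a type deserializer, each column holds raw DynamoDB attribute-value maps such as {"S": "..."}.

    # Full table scan (no query): the paginator walks every page of results.
    df = AWSDynamoDB.load("awsdynamodb://eu-central-1/example_table", None)

    # PartiQL query: passed straight to execute_statement (first response page only).
    df = AWSDynamoDB.load(
        "awsdynamodb://eu-central-1/example_table",
        "SELECT * FROM example_table WHERE pk = 'user#1'",
    )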
tableconv/adapters/df/aws_logs.py ADDED
@@ -0,0 +1,68 @@
+ import contextlib
+ import datetime
+ import logging
+ import time
+
+ import pandas as pd
+
+ from tableconv.adapters.df.base import Adapter, register_adapter
+ from tableconv.exceptions import InvalidQueryError
+ from tableconv.parse_time import parse_input_time
+ from tableconv.uri import parse_uri
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_adapter(["awslogs"], read_only=True)
+ class AWSLogs(Adapter):
+     """AWS Cloudwatch Logs (Disclaimer: Only supports Logs Insights queries for now)"""
+
+     @staticmethod
+     def get_example_url(scheme):
+         return f"{scheme}://eu-central-1//aws/lambda/example-function"
+
+     @staticmethod
+     def load(uri, query):
+         import boto3
+
+         uri = parse_uri(uri)
+         aws_region = uri.authority
+
+         from_time = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=1)
+         to_time = datetime.datetime.now(tz=datetime.timezone.utc)
+         if "from" in uri.query:
+             from_time = parse_input_time(uri.query["from"])
+         if "to" in uri.query:
+             to_time = parse_input_time(uri.query["to"])
+         client = boto3.client("logs", region_name=aws_region)
+
+         path = uri.path
+         if path[0] == "/":
+             path = path[1:]
+
+         query_id = client.start_query(
+             logGroupName=path,
+             startTime=int(from_time.timestamp()),
+             endTime=int(to_time.timestamp()),
+             queryString=query,
+             limit=int(uri.query.get("limit", 1000)),
+         )["queryId"]
+
+         try:
+             while True:
+                 results = client.get_query_results(queryId=query_id)
+                 if results["status"] in ("Failed", "Timeout", "Unknown"):
+                     raise InvalidQueryError(f'AWS CloudWatch Logs Insights Query {results["status"]}.')
+                 elif results["status"] == "Complete":
+                     raw_array = [{item["field"]: item["value"] for item in row} for row in results["results"]]
+                     break
+                 else:
+                     assert results["status"] in ("Running", "Scheduled")
+                     POLL_INTERVAL = datetime.timedelta(seconds=2)
+                     time.sleep(POLL_INTERVAL.total_seconds())
+         except Exception as exc:
+             with contextlib.suppress(Exception):
+                 client.stop_query(queryId=query_id)
+             raise exc
+
+         return pd.DataFrame.from_records(raw_array)
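A usage sketch (assumes AWS credentials in the environment; the from value shown is an ISO-8601 timestamp, though the exact formats accepted depend on parse_input_time in tableconv/parse_time.py, which is not excerpted here):

    # Logs Insights query; without from/to params the window defaults to the last 24 hours.
    df = AWSLogs.load(
        "awslogs://eu-central-1//aws/lambda/example-function?from=2025-02-17T00:00:00Z&limit=500",
        "fields @timestamp, @message | sort @timestamp desc",
    )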