tinybird-0.0.1.dev5-py3-none-any.whl → tinybird-0.0.1.dev7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. tinybird/__cli__.py +7 -8
  2. tinybird/tb/cli.py +28 -0
  3. tinybird/{tb_cli_modules → tb/modules}/auth.py +5 -5
  4. tinybird/{tb_cli_modules → tb/modules}/branch.py +5 -25
  5. tinybird/{tb_cli_modules → tb/modules}/build.py +10 -21
  6. tinybird/tb/modules/cicd.py +271 -0
  7. tinybird/{tb_cli_modules → tb/modules}/cli.py +20 -140
  8. tinybird/tb/modules/common.py +2110 -0
  9. tinybird/tb/modules/config.py +352 -0
  10. tinybird/{tb_cli_modules → tb/modules}/connection.py +4 -4
  11. tinybird/{tb_cli_modules → tb/modules}/create.py +20 -20
  12. tinybird/tb/modules/datafile/build.py +2103 -0
  13. tinybird/tb/modules/datafile/build_common.py +118 -0
  14. tinybird/tb/modules/datafile/build_datasource.py +403 -0
  15. tinybird/tb/modules/datafile/build_pipe.py +648 -0
  16. tinybird/tb/modules/datafile/common.py +897 -0
  17. tinybird/tb/modules/datafile/diff.py +197 -0
  18. tinybird/tb/modules/datafile/exceptions.py +23 -0
  19. tinybird/tb/modules/datafile/format_common.py +66 -0
  20. tinybird/tb/modules/datafile/format_datasource.py +160 -0
  21. tinybird/tb/modules/datafile/format_pipe.py +195 -0
  22. tinybird/tb/modules/datafile/parse_datasource.py +41 -0
  23. tinybird/tb/modules/datafile/parse_pipe.py +69 -0
  24. tinybird/tb/modules/datafile/pipe_checker.py +560 -0
  25. tinybird/tb/modules/datafile/pull.py +157 -0
  26. tinybird/{tb_cli_modules → tb/modules}/datasource.py +7 -6
  27. tinybird/tb/modules/exceptions.py +91 -0
  28. tinybird/{tb_cli_modules → tb/modules}/fmt.py +6 -3
  29. tinybird/{tb_cli_modules → tb/modules}/job.py +3 -3
  30. tinybird/{tb_cli_modules → tb/modules}/llm.py +1 -1
  31. tinybird/{tb_cli_modules → tb/modules}/local.py +9 -5
  32. tinybird/{tb_cli_modules → tb/modules}/mock.py +5 -5
  33. tinybird/{tb_cli_modules → tb/modules}/pipe.py +11 -5
  34. tinybird/{tb_cli_modules → tb/modules}/prompts.py +1 -1
  35. tinybird/tb/modules/regions.py +9 -0
  36. tinybird/{tb_cli_modules → tb/modules}/tag.py +2 -2
  37. tinybird/tb/modules/telemetry.py +310 -0
  38. tinybird/{tb_cli_modules → tb/modules}/test.py +5 -5
  39. tinybird/{tb_cli_modules → tb/modules}/tinyunit/tinyunit.py +1 -1
  40. tinybird/{tb_cli_modules → tb/modules}/token.py +3 -3
  41. tinybird/{tb_cli_modules → tb/modules}/workspace.py +5 -5
  42. tinybird/{tb_cli_modules → tb/modules}/workspace_members.py +4 -4
  43. tinybird/tb_cli_modules/common.py +9 -25
  44. tinybird/tb_cli_modules/config.py +0 -8
  45. {tinybird-0.0.1.dev5.dist-info → tinybird-0.0.1.dev7.dist-info}/METADATA +1 -1
  46. tinybird-0.0.1.dev7.dist-info/RECORD +71 -0
  47. tinybird-0.0.1.dev7.dist-info/entry_points.txt +2 -0
  48. tinybird/datafile.py +0 -6123
  49. tinybird/tb_cli.py +0 -28
  50. tinybird-0.0.1.dev5.dist-info/RECORD +0 -52
  51. tinybird-0.0.1.dev5.dist-info/entry_points.txt +0 -2
  52. /tinybird/{tb_cli_modules → tb/modules}/table.py +0 -0
  53. /tinybird/{tb_cli_modules → tb/modules}/tinyunit/tinyunit_lib.py +0 -0
  54. {tinybird-0.0.1.dev5.dist-info → tinybird-0.0.1.dev7.dist-info}/WHEEL +0 -0
  55. {tinybird-0.0.1.dev5.dist-info → tinybird-0.0.1.dev7.dist-info}/top_level.txt +0 -0
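
The bulk of the change is a reorganization: the CLI modules under tinybird/tb_cli_modules/ move to tinybird/tb/modules/, the old monolithic tinybird/datafile.py is split into the tinybird/tb/modules/datafile/ package, and the console entry point moves from tinybird/tb_cli.py to tinybird/tb/cli.py. A minimal sketch of what the path change means for code that imports these modules directly (module paths are taken from the list above; the alias is illustrative):

    # 0.0.1.dev5 (old layout)
    from tinybird.tb_cli_modules import common
    from tinybird import datafile

    # 0.0.1.dev7 (new layout)
    from tinybird.tb.modules import common
    from tinybird.tb.modules.datafile import common as datafile_common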
@@ -0,0 +1,897 @@
+ import glob
+ import itertools
+ import os
+ import os.path
+ import pprint
+ import re
+ import shlex
+ import textwrap
+ import traceback
+ from collections import namedtuple
+ from io import StringIO
+ from pathlib import Path
+ from string import Template
+ from typing import Any, Callable, Dict, List, Optional, Tuple, cast
+
+ import click
+ from mypy_extensions import KwArg, VarArg
+
+ from tinybird.ch_utils.engine import ENABLED_ENGINES
+ from tinybird.feedback_manager import FeedbackManager
+ from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
+ from tinybird.tb.modules.datafile.exceptions import IncludeFileNotFoundException, ParseException, ValidationException
+ from tinybird.tb.modules.exceptions import CLIPipeException
+
+
+ class PipeTypes:
+     MATERIALIZED = "materialized"
+     ENDPOINT = "endpoint"
+     COPY = "copy"
+     DATA_SINK = "sink"
+     STREAM = "stream"
+     DEFAULT = "default"
+
+
+ class PipeNodeTypes:
+     MATERIALIZED = "materialized"
+     ENDPOINT = "endpoint"
+     STANDARD = "standard"
+     DEFAULT = "default"
+     DATA_SINK = "sink"
+     COPY = "copy"
+     STREAM = "stream"
+
+
+ class DataFileExtensions:
+     PIPE = ".pipe"
+     DATASOURCE = ".datasource"
+     INCL = ".incl"
+
+
+ class CopyModes:
+     APPEND = "append"
+     REPLACE = "replace"
+
+     valid_modes = (APPEND, REPLACE)
+
+     @staticmethod
+     def is_valid(node_mode):
+         return node_mode.lower() in CopyModes.valid_modes
+
+
+ class CopyParameters:
+     TARGET_DATASOURCE = "target_datasource"
+     COPY_SCHEDULE = "copy_schedule"
+     COPY_MODE = "copy_mode"
+
+
+ DATAFILE_NEW_LINE = "\n"
+ DATAFILE_INDENT = " " * 4
+
+ ON_DEMAND = "@on-demand"
+ DEFAULT_CRON_PERIOD: int = 60
+
+ INTERNAL_TABLES: Tuple[str, ...] = (
+     "datasources_ops_log",
+     "pipe_stats",
+     "pipe_stats_rt",
+     "block_log",
+     "data_connectors_log",
+     "kafka_ops_log",
+     "datasources_storage",
+     "endpoint_errors",
+     "bi_stats_rt",
+     "bi_stats",
+ )
+
+ PREVIEW_CONNECTOR_SERVICES = ["s3", "s3_iamrole", "gcs"]
+
+ pp = pprint.PrettyPrinter()
+
+
+ class Datafile:
+     def __init__(self) -> None:
+         self.maintainer: Optional[str] = None
+         self.sources: List[str] = []
+         self.nodes: List[Dict[str, Any]] = []
+         self.tokens: List[Dict[str, Any]] = []
+         self.version: Optional[int] = None
+         self.description: Optional[str] = None
+         self.raw: Optional[List[str]] = None
+         self.includes: Dict[str, Any] = {}
+         self.shared_with: List[str] = []
+         self.warnings: List[str] = []
+         self.filtering_tags: Optional[List[str]] = None
+
+     def validate(self) -> None:
+         for x in self.nodes:
+             if not x["name"].strip():
+                 raise ValidationException("invalid node name, can't be empty")
+             if "sql" not in x:
+                 raise ValidationException("node %s must have a SQL query" % x["name"])
+         if self.version is not None and (not isinstance(self.version, int) or self.version < 0):
+             raise ValidationException("version must be a positive integer")
+
+     def is_equal(self, other):
+         if len(self.nodes) != len(other.nodes):
+             return False
+
+         return all(self.nodes[i] == other.nodes[i] for i, _ in enumerate(self.nodes))
+
+
+ def format_filename(filename: str, hide_folders: bool = False):
+     return os.path.basename(filename) if hide_folders else filename
+
+
+ def _unquote(x: str):
+     QUOTES = ('"', "'")
+     if x[0] in QUOTES and x[-1] in QUOTES:
+         x = x[1:-1]
+     return x
+
+
+ def eval_var(s: str, skip: bool = False) -> str:
+     if skip:
+         return s
+     # replace ENV variables
+     # it's probably a bad idea to allow to get any env var
+     return Template(s).safe_substitute(os.environ)
+
+
+ def parse_tags(tags: str) -> Tuple[str, List[str]]:
+     """
+     Parses a string of tags into:
+     - kv_tags: a string of key-value tags: the previous tags we have for operational purposes. It
+       has the format key=value&key2=value2 (with_staging=true&with_last_date=true)
+     - filtering_tags: a list of tags that are used for filtering.
+
+     Example: "with_staging=true&with_last_date=true,billing,stats" ->
+       kv_tags = "with_staging=true&with_last_date=true"
+       filtering_tags = ["billing", "stats"]
+     """
+     kv_tags = []
+     filtering_tags = []
+
+     entries = tags.split(",")
+     for entry in entries:
+         trimmed_entry = entry.strip()
+         if "=" in trimmed_entry:
+             kv_tags.append(trimmed_entry)
+         else:
+             filtering_tags.append(trimmed_entry)
+
+     all_kv_tags = "&".join(kv_tags)
+
+     return all_kv_tags, filtering_tags
+
+
+ def parse(
+     s: str,
+     default_node: Optional[str] = None,
+     basepath: str = ".",
+     replace_includes: bool = True,
+     skip_eval: bool = False,
+ ) -> Datafile:
+     """
+     Parses `s` string into a document
+     >>> d = parse("FROM SCRATCH\\nSOURCE 'https://example.com'\\n#this is a comment\\nMAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
+     >>> d.maintainer
+     'rambo'
+     >>> d.sources
+     ['https://example.com']
+     >>> len(d.nodes)
+     2
+     >>> d.nodes[0]
+     {'name': 'test_01', 'description': 'this is a node that does whatever', 'sql': 'SELECT * from test_00'}
+     >>> d.nodes[1]
+     {'name': 'test_02', 'description': 'this is a node that does whatever', 'sql': 'SELECT * from test_01\\nWHERE a > 1\\nGROUP by a'}
+     """
+     lines = list(StringIO(s, newline=None))
+
+     doc = Datafile()
+     doc.raw = list(StringIO(s, newline=None))
+
+     parser_state = namedtuple("parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql"])
+
+     parser_state.multiline = False
+     parser_state.current_node = False
+
+     def assign(attr):
+         def _fn(x, **kwargs):
+             setattr(doc, attr, _unquote(x))
+
+         return _fn
+
+     def schema(*args, **kwargs):
+         s = _unquote("".join(args))
+         try:
+             sh = parse_table_structure(s)
+         except Exception as e:
+             raise ParseException(FeedbackManager.error_parsing_schema(line=kwargs["lineno"], error=e))
+
+         parser_state.current_node["schema"] = ",".join(schema_to_sql_columns(sh))
+         parser_state.current_node["columns"] = sh
+
+     def indexes(*args, **kwargs):
+         s = _unquote("".join(args))
+         if not s:
+             return
+         try:
+             indexes = parse_indexes_structure(s.splitlines())
+         except Exception as e:
+             raise ParseException(FeedbackManager.error_parsing_indices(line=kwargs["lineno"], error=e))
+
+         parser_state.current_node["indexes"] = indexes
+
+     def assign_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
+         def _f(*args: str, **kwargs: Any):
+             s = _unquote((" ".join(args)).strip())
+             parser_state.current_node[v.lower()] = eval_var(s, skip=skip_eval)
+
+         return _f
+
+     def sources(x: str, **kwargs: Any) -> None:
+         doc.sources.append(_unquote(x))
+
+     def node(*args: str, **kwargs: Any) -> None:
+         node = {"name": eval_var(_unquote(args[0]))}
+         doc.nodes.append(node)
+         parser_state.current_node = node
+
+     def scope(*args: str, **kwargs: Any) -> None:
+         scope = {"name": eval_var(_unquote(args[0]))}
+         doc.nodes.append(scope)
+         parser_state.current_node = scope
+
+     def description(*args: str, **kwargs: Any) -> None:
+         description = (" ".join(args)).strip()
+
+         if parser_state.current_node:
+             parser_state.current_node["description"] = description
+             if parser_state.current_node.get("name", "") == "default":
+                 doc.description = description
+         else:
+             doc.description = description
+
+     def sql(var_name: str, **kwargs: Any) -> Callable[[str, KwArg(Any)], None]:
+         def _f(sql: str, **kwargs: Any) -> None:
+             if not parser_state.current_node:
+                 raise ParseException("SQL must be called after a NODE command")
+             parser_state.current_node[var_name] = (
+                 textwrap.dedent(sql).rstrip() if "%" not in sql.strip()[0] else sql.strip()
+             )
+
+         # HACK this cast is needed because Mypy
+         return cast(Callable[[str, KwArg(Any)], None], _f)
+
+     def assign_node_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
+         def _f(*args: str, **kwargs: Any) -> None:
+             if not parser_state.current_node:
+                 raise ParseException("%s must be called after a NODE command" % v)
+             return assign_var(v)(*args, **kwargs)
+
+         return _f
+
+     def add_token(*args: str, **kwargs: Any) -> None:  # token_name, permissions):
+         if len(args) < 2:
+             raise ParseException('TOKEN gets two params, token name and permissions e.g TOKEN "read api token" READ')
+         doc.tokens.append({"token_name": _unquote(args[0]), "permissions": args[1]})
+
+     def test(*args: str, **kwargs: Any) -> None:
+         # TODO: Should be removed?
+         print("test", args, kwargs)  # noqa: T201
+
+     def include(*args: str, **kwargs: Any) -> None:
+         f = _unquote(args[0])
+         f = eval_var(f)
+         attrs = dict(_unquote(x).split("=", 1) for x in args[1:])
+         nonlocal lines
+         lineno = kwargs["lineno"]
+         replace_includes = kwargs["replace_includes"]
+         n = lineno
+         args_with_attrs = " ".join(args)
+
+         try:
+             while True:
+                 n += 1
+                 if len(lines) <= n:
+                     break
+                 if "NODE" in lines[n]:
+                     doc.includes[args_with_attrs] = lines[n]
+                     break
+             if args_with_attrs not in doc.includes:
+                 doc.includes[args_with_attrs] = ""
+         except Exception:
+             pass
+
+         # If this parse was triggered by format, we don't want to replace the file
+         if not replace_includes:
+             return
+
+         # be sure to replace the include line
+         p = Path(basepath)
+
+         try:
+             with open(p / f) as file:
+                 try:
+                     ll = list(StringIO(file.read(), newline=None))
+                     node_line = [line for line in ll if "NODE" in line]
+                     if node_line and doc.includes[args_with_attrs]:
+                         doc.includes[node_line[0].split("NODE")[-1].split("\n")[0].strip()] = ""
+                 except Exception:
+                     pass
+                 finally:
+                     file.seek(0)
+                 lines[lineno : lineno + 1] = [
+                     "",
+                     *list(StringIO(Template(file.read()).safe_substitute(attrs), newline=None)),
+                 ]
+         except FileNotFoundError:
+             raise IncludeFileNotFoundException(f, lineno)
+
+     def version(*args: str, **kwargs: Any) -> None:
+         if len(args) < 1:
+             raise ParseException("VERSION gets one positive integer param")
+         try:
+             version = int(args[0])
+             if version < 0:
+                 raise ValidationException("version must be a positive integer e.g VERSION 2")
+             doc.version = version
+         except ValueError:
+             raise ValidationException("version must be a positive integer e.g VERSION 2")
+
+     def shared_with(*args: str, **kwargs: Any) -> None:
+         for entries in args:
+             # In case they specify multiple workspaces
+             doc.shared_with += [workspace.strip() for workspace in entries.splitlines()]
+
+     def __init_engine(v: str):
+         if not parser_state.current_node:
+             raise Exception(f"{v} must be called after a NODE command")
+         if "engine" not in parser_state.current_node:
+             parser_state.current_node["engine"] = {"type": None, "args": []}
+
+     def set_engine(*args: str, **kwargs: Any) -> None:
+         __init_engine("ENGINE")
+         engine_type = _unquote((" ".join(args)).strip())
+         parser_state.current_node["engine"]["type"] = eval_var(engine_type, skip=skip_eval)
+
+     def add_engine_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
+         def _f(*args: str, **kwargs: Any):
+             __init_engine(f"ENGINE_{v}".upper())
+             engine_arg = eval_var(_unquote((" ".join(args)).strip()), skip=skip_eval)
+             parser_state.current_node["engine"]["args"].append((v, engine_arg))
+
+         return _f
+
+     def tags(*args: str, **kwargs: Any) -> None:
+         raw_tags = _unquote((" ".join(args)).strip())
+         operational_tags, filtering_tags = parse_tags(raw_tags)
+
+         # Pipe nodes or Data Sources
+         if parser_state.current_node and operational_tags:
+             operational_tags_args = (operational_tags,)
+             assign_node_var("tags")(*operational_tags_args, **kwargs)
+
+         if filtering_tags:
+             if doc.filtering_tags is None:
+                 doc.filtering_tags = filtering_tags
+             else:
+                 doc.filtering_tags += filtering_tags
+
+     cmds = {
+         "from": assign("from"),
+         "source": sources,
+         "maintainer": assign("maintainer"),
+         "schema": schema,
+         "indexes": indexes,
+         # TODO: Added to be able to merge MR 11347, let's remove it afterwards
+         "indices": indexes,
+         "engine": set_engine,
+         "partition_key": assign_var("partition_key"),
+         "sorting_key": assign_var("sorting_key"),
+         "primary_key": assign_var("primary_key"),
+         "sampling_key": assign_var("sampling_key"),
+         "ttl": assign_var("ttl"),
+         "settings": assign_var("settings"),
+         "node": node,
+         "scope": scope,
+         "description": description,
+         "type": assign_node_var("type"),
+         "datasource": assign_node_var("datasource"),
+         "tags": tags,
+         "target_datasource": assign_node_var("target_datasource"),
+         "copy_schedule": assign_node_var(CopyParameters.COPY_SCHEDULE),
+         "copy_mode": assign_node_var("mode"),
+         "mode": assign_node_var("mode"),
+         "resource": assign_node_var("resource"),
+         "filter": assign_node_var("filter"),
+         "token": add_token,
+         "test": test,
+         "include": include,
+         "sql": sql("sql"),
+         "version": version,
+         "kafka_connection_name": assign_var("kafka_connection_name"),
+         "kafka_topic": assign_var("kafka_topic"),
+         "kafka_group_id": assign_var("kafka_group_id"),
+         "kafka_bootstrap_servers": assign_var("kafka_bootstrap_servers"),
+         "kafka_key": assign_var("kafka_key"),
+         "kafka_secret": assign_var("kafka_secret"),
+         "kafka_schema_registry_url": assign_var("kafka_schema_registry_url"),
+         "kafka_target_partitions": assign_var("kafka_target_partitions"),
+         "kafka_auto_offset_reset": assign_var("kafka_auto_offset_reset"),
+         "kafka_store_raw_value": assign_var("kafka_store_raw_value"),
+         "kafka_store_headers": assign_var("kafka_store_headers"),
+         "kafka_store_binary_headers": assign_var("kafka_store_binary_headers"),
+         "kafka_key_avro_deserialization": assign_var("kafka_key_avro_deserialization"),
+         "kafka_ssl_ca_pem": assign_var("kafka_ssl_ca_pem"),
+         "kafka_sasl_mechanism": assign_var("kafka_sasl_mechanism"),
+         "import_service": assign_var("import_service"),
+         "import_connection_name": assign_var("import_connection_name"),
+         "import_schedule": assign_var("import_schedule"),
+         "import_strategy": assign_var("import_strategy"),
+         "import_external_datasource": assign_var("import_external_datasource"),
+         "import_bucket_uri": assign_var("import_bucket_uri"),
+         "import_from_timestamp": assign_var("import_from_timestamp"),
+         "import_query": assign_var("import_query"),
+         "import_table_arn": assign_var("import_table_arn"),
+         "import_export_bucket": assign_var("import_export_bucket"),
+         "shared_with": shared_with,
+         "export_service": assign_var("export_service"),
+         "export_connection_name": assign_var("export_connection_name"),
+         "export_schedule": assign_var("export_schedule"),
+         "export_bucket_uri": assign_var("export_bucket_uri"),
+         "export_file_template": assign_var("export_file_template"),
+         "export_format": assign_var("export_format"),
+         "export_strategy": assign_var("export_strategy"),
+         "export_compression": assign_var("export_compression"),
+         "export_kafka_topic": assign_var("export_kafka_topic"),
+     }
+
+     engine_vars = set()
+
+     for _engine, (params, options) in ENABLED_ENGINES:
+         for p in params:
+             engine_vars.add(p.name)
+         for o in options:
+             engine_vars.add(o.name)
+     for v in engine_vars:
+         cmds[f"engine_{v}"] = add_engine_var(v)
+
+     if default_node:
+         node(default_node)
+
+     lineno = 0
+     try:
+         while lineno < len(lines):
+             line = lines[lineno]
+             try:
+                 sa = shlex.shlex(line)
+                 sa.whitespace_split = True
+                 lexer = list(sa)
+             except ValueError:
+                 sa = shlex.shlex(shlex.quote(line))
+                 sa.whitespace_split = True
+                 lexer = list(sa)
+             if lexer:
+                 cmd, args = lexer[0], lexer[1:]
+                 if (
+                     parser_state.multiline
+                     and cmd.lower() in cmds
+                     and not (line.startswith(" ") or line.startswith("\t") or line.lower().startswith("from"))
+                 ):
+                     parser_state.multiline = False
+                     cmds[parser_state.command](
+                         parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes
+                     )
+
+                 if not parser_state.multiline:
+                     if len(args) >= 1 and args[0] == ">":
+                         parser_state.multiline = True
+                         parser_state.command = cmd.lower()
+                         parser_state.multiline_string = ""
+                     else:
+                         if cmd.lower() == "settings":
+                             raise click.ClickException(FeedbackManager.error_settings_not_allowed())
+                         if cmd.lower() in cmds:
+                             cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes)
+                         else:
+                             raise click.ClickException(FeedbackManager.error_option(option=cmd.upper()))
+                 else:
+                     parser_state.multiline_string += line
+             lineno += 1
+         # close final state
+         if parser_state.multiline:
+             cmds[parser_state.command](parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes)
+     except ParseException as e:
+         raise ParseException(str(e), lineno=lineno)
+     except ValidationException as e:
+         raise ValidationException(str(e), lineno=lineno)
+     except IndexError as e:
+         if "node" in line.lower():
+             raise click.ClickException(FeedbackManager.error_missing_node_name())
+         elif "sql" in line.lower():
+             raise click.ClickException(FeedbackManager.error_missing_sql_command())
+         elif "datasource" in line.lower():
+             raise click.ClickException(FeedbackManager.error_missing_datasource_name())
+         else:
+             raise ValidationException(f"Validation error, found {line} in line {str(lineno)}: {str(e)}", lineno=lineno)
+     except IncludeFileNotFoundException as e:
+         raise IncludeFileNotFoundException(str(e), lineno=lineno)
+     except Exception as e:
+         traceback.print_tb(e.__traceback__)
+         raise ParseException(f"Unexpected error: {e}", lineno=lineno)
+
+     return doc
+
+
+ class ImportReplacements:
+     _REPLACEMENTS: Tuple[Tuple[str, str, Optional[str]], ...] = (
+         ("import_service", "service", None),
+         ("import_strategy", "mode", "replace"),
+         ("import_connection_name", "connection", None),
+         ("import_schedule", "cron", ON_DEMAND),
+         ("import_query", "query", None),
+         ("import_connector", "connector", None),
+         ("import_external_datasource", "external_data_source", None),
+         ("import_bucket_uri", "bucket_uri", None),
+         ("import_from_timestamp", "from_time", None),
+         ("import_table_arn", "dynamodb_table_arn", None),
+         ("import_export_bucket", "dynamodb_export_bucket", None),
+     )
+
+     @staticmethod
+     def get_datafile_parameter_keys() -> List[str]:
+         return [x[0] for x in ImportReplacements._REPLACEMENTS]
+
+     @staticmethod
+     def get_api_param_for_datafile_param(connector_service: str, key: str) -> Tuple[Optional[str], Optional[str]]:
+         """Returns the API parameter name and default value for a given
+         datafile parameter.
+         """
+         key = key.lower()
+         for datafile_k, linker_k, value in ImportReplacements._REPLACEMENTS:
+             if datafile_k == key:
+                 return linker_k, value
+         return None, None
+
+     @staticmethod
+     def get_datafile_param_for_linker_param(connector_service: str, linker_param: str) -> Optional[str]:
+         """Returns the datafile parameter name for a given linker parameter."""
+         linker_param = linker_param.lower()
+         for datafile_k, linker_k, _ in ImportReplacements._REPLACEMENTS:
+             if linker_k == linker_param:
+                 return datafile_k
+         return None
+
+     @staticmethod
+     def get_datafile_value_for_linker_value(
+         connector_service: str, linker_param: str, linker_value: str
+     ) -> Optional[str]:
+         """Map linker values to datafile values."""
+         linker_param = linker_param.lower()
+         if linker_param != "cron":
+             return linker_value
+         if linker_value == "@once":
+             return ON_DEMAND
+         if connector_service in PREVIEW_CONNECTOR_SERVICES:
+             return "@auto"
+         return linker_value
+
+
+ class ExportReplacements:
+     SERVICES = ("gcs_hmac", "s3", "s3_iamrole", "kafka")
+     NODE_TYPES = (PipeNodeTypes.DATA_SINK, PipeNodeTypes.STREAM)
+     _REPLACEMENTS = (
+         ("export_service", "service", None),
+         ("export_connection_name", "connection", None),
+         ("export_schedule", "schedule_cron", ""),
+         ("export_bucket_uri", "path", None),
+         ("export_file_template", "file_template", None),
+         ("export_format", "format", "csv"),
+         ("export_compression", "compression", None),
+         ("export_strategy", "strategy", "@new"),
+         ("export_kafka_topic", "kafka_topic", None),
+         ("kafka_connection_name", "connection", None),
+         ("kafka_topic", "kafka_topic", None),
+     )
+
+     @staticmethod
+     def get_export_service(node: Dict[str, Optional[str]]) -> str:
+         if (node.get("type", "standard") or "standard").lower() == PipeNodeTypes.STREAM:
+             return "kafka"
+         return (node.get("export_service", "") or "").lower()
+
+     @staticmethod
+     def get_node_type(node: Dict[str, Optional[str]]) -> str:
+         return (node.get("type", "standard") or "standard").lower()
+
+     @staticmethod
+     def is_export_node(node: Dict[str, Optional[str]]) -> bool:
+         export_service = ExportReplacements.get_export_service(node)
+         node_type = (node.get("type", "standard") or "standard").lower()
+         if not export_service:
+             return False
+         if export_service not in ExportReplacements.SERVICES:
+             raise CLIPipeException(f"Invalid export service: {export_service}")
+         if node_type not in ExportReplacements.NODE_TYPES:
+             raise CLIPipeException(f"Invalid export node type: {node_type}")
+         return True
+
+     @staticmethod
+     def get_params_from_datafile(node: Dict[str, Optional[str]]) -> Dict[str, Optional[str]]:
+         """Returns the export parameters for a given node."""
+         params = {}
+         node_type = ExportReplacements.get_node_type(node)
+         for datafile_key, export_key, default_value in ExportReplacements._REPLACEMENTS:
+             if node_type != PipeNodeTypes.STREAM and datafile_key.startswith("kafka_"):
+                 continue
+             if node_type == PipeNodeTypes.STREAM and datafile_key.startswith("export_"):
+                 continue
+             if datafile_key == "export_schedule" and node.get(datafile_key, None) == ON_DEMAND:
+                 node[datafile_key] = ""
+             params[export_key] = node.get(datafile_key, default_value)
+         return params
+
+     @staticmethod
+     def get_datafile_key(param: str, node: Dict[str, Optional[str]]) -> Optional[str]:
+         """Returns the datafile key for a given export parameter."""
+         node_type = ExportReplacements.get_node_type(node)
+         for datafile_key, export_key, _ in ExportReplacements._REPLACEMENTS:
+             if node_type != PipeNodeTypes.STREAM and datafile_key.startswith("kafka_"):
+                 continue
+             if node_type == PipeNodeTypes.STREAM and datafile_key.startswith("export_"):
+                 continue
+             if export_key == param.lower():
+                 return datafile_key.upper()
+         return None
+
+
+ def get_project_filenames(folder: str, with_vendor=False) -> List[str]:
+     folders: List[str] = [
+         f"{folder}/*.datasource",
+         f"{folder}/datasources/*.datasource",
+         f"{folder}/*.pipe",
+         f"{folder}/pipes/*.pipe",
+         f"{folder}/endpoints/*.pipe",
+         f"{folder}/materializations/*.pipe",
+         f"{folder}/sinks/*.pipe",
+         f"{folder}/copies/*.pipe",
+         f"{folder}/playgrounds/*.pipe",
+     ]
+     if with_vendor:
+         folders.append(f"{folder}/vendor/**/**/*.datasource")
+     filenames: List[str] = []
+     for x in folders:
+         filenames += glob.glob(x)
+     return filenames
+
+
+ def has_internal_datafiles(folder: str) -> bool:
+     folder = folder or "."
+     filenames = get_project_filenames(folder)
+     return any([f for f in filenames if "spans" in str(f) and "vendor" not in str(f)])
+
+
+ def peek(iterable):
+     try:
+         first = next(iterable)
+     except Exception:
+         return None, None
+     return first, itertools.chain([first], iterable)
+
+
+ def normalize_array(items: List[Dict[str, Optional[Any]]]) -> List[Dict]:
+     """
+     sorted() doesn't support values with different types for the same column, like None vs str.
+     So we cast each None to the default value of the column's type if one exists; if all the values are None, we leave them as None.
+     >>> normalize_array([{'x': 'hello World'}, {'x': None}])
+     [{'x': 'hello World'}, {'x': ''}]
+     >>> normalize_array([{'x': 3}, {'x': None}])
+     [{'x': 3}, {'x': 0}]
+     >>> normalize_array([{'x': {'y': [1,2,3,4]}}, {'x': {'z': "Hello" }}])
+     [{'x': {'y': [1, 2, 3, 4]}}, {'x': {'z': 'Hello'}}]
+     """
+     types: Dict[str, type] = {}
+     if len(items) == 0:
+         return items
+
+     columns = items[0].keys()
+     for column in columns:
+         for object in items:
+             if object[column] is not None:
+                 types[column] = type(object[column])
+                 break
+
+     for object in items:
+         for column in columns:
+             if object[column] is not None:
+                 continue
+
+             # If None, we replace it for the default value
+             if types.get(column, None):
+                 object[column] = types[column]()
+
+     return items
+
+
+ def find_file_by_name(
+     folder: str,
+     name: str,
+     verbose: bool = False,
+     is_raw: bool = False,
+     workspace_lib_paths: Optional[List[Tuple[str, str]]] = None,
+     resource: Optional[Dict] = None,
+ ):
+     f = Path(folder)
+     ds = name + ".datasource"
+     if os.path.isfile(os.path.join(folder, ds)):
+         return ds, None
+     if os.path.isfile(f / "datasources" / ds):
+         return ds, None
+
+     pipe = name + ".pipe"
+     if os.path.isfile(os.path.join(folder, pipe)):
+         return pipe, None
+
+     if os.path.isfile(f / "endpoints" / pipe):
+         return pipe, None
+
+     if os.path.isfile(f / "pipes" / pipe):
+         return pipe, None
+
+     token = name + ".token"
+     if os.path.isfile(f / "tokens" / token):
+         return token, None
+
+     # look for the file in subdirectories if it's not found in datasources folder
+     if workspace_lib_paths:
+         _resource = None
+         for wk_name, wk_path in workspace_lib_paths:
+             file = None
+             if name.startswith(f"{wk_name}."):
+                 file, _resource = find_file_by_name(
+                     wk_path, name.replace(f"{wk_name}.", ""), verbose, is_raw, resource=resource
+                 )
+             if file:
+                 return file, _resource
+
+     if not is_raw:
+         f, raw = find_file_by_name(
+             folder,
+             name,
+             verbose=verbose,
+             is_raw=True,
+             workspace_lib_paths=workspace_lib_paths,
+             resource=resource,
+         )
+         return f, raw
+
+     # materialized node with DATASOURCE definition
+     if resource and "nodes" in resource:
+         for node in resource["nodes"]:
+             params = node.get("params", {})
+             if (
+                 params.get("type", None) == "materialized"
+                 and params.get("engine", None)
+                 and params.get("datasource", None)
+             ):
+                 pipe = resource["resource_name"] + ".pipe"
+                 pipe_file_exists = (
+                     os.path.isfile(os.path.join(folder, pipe))
+                     or os.path.isfile(f / "endpoints" / pipe)
+                     or os.path.isfile(f / "pipes" / pipe)
+                 )
+                 is_target_datasource = params["datasource"] == name
+                 if pipe_file_exists and is_target_datasource:
+                     return pipe, {"resource_name": params.get("datasource")}
+
+     if verbose:
+         click.echo(FeedbackManager.warning_file_not_found_inside(name=name, folder=folder))
+
+     return None, None
+
+
+ def get_name_version(ds: str) -> Dict[str, Any]:
+     """
+     Given a name like "dev__name__v0" returns {'name': 'dev__name', 'version': 0}
+     >>> get_name_version('dev__name__v0')
+     {'name': 'dev__name', 'version': 0}
+     >>> get_name_version('name__v0')
+     {'name': 'name', 'version': 0}
+     >>> get_name_version('dev__name')
+     {'name': 'dev__name', 'version': None}
+     >>> get_name_version('name')
+     {'name': 'name', 'version': None}
+     >>> get_name_version('horario__3__pipe')
+     {'name': 'horario__3__pipe', 'version': None}
+     >>> get_name_version('horario__checker')
+     {'name': 'horario__checker', 'version': None}
+     >>> get_name_version('dev__horario__checker')
+     {'name': 'dev__horario__checker', 'version': None}
+     >>> get_name_version('tg__dActividades__v0_pipe_3907')
+     {'name': 'tg__dActividades', 'version': 0}
+     >>> get_name_version('tg__dActividades__va_pipe_3907')
+     {'name': 'tg__dActividades__va_pipe_3907', 'version': None}
+     >>> get_name_version('tg__origin_workspace.shared_ds__v3907')
+     {'name': 'tg__origin_workspace.shared_ds', 'version': 3907}
+     >>> get_name_version('tmph8egtl__')
+     {'name': 'tmph8egtl__', 'version': None}
+     >>> get_name_version('tmph8egtl__123__')
+     {'name': 'tmph8egtl__123__', 'version': None}
+     >>> get_name_version('dev__name__v0')
+     {'name': 'dev__name', 'version': 0}
+     >>> get_name_version('name__v0')
+     {'name': 'name', 'version': 0}
+     >>> get_name_version('dev__name')
+     {'name': 'dev__name', 'version': None}
+     >>> get_name_version('name')
+     {'name': 'name', 'version': None}
+     >>> get_name_version('horario__3__pipe')
+     {'name': 'horario__3__pipe', 'version': None}
+     >>> get_name_version('horario__checker')
+     {'name': 'horario__checker', 'version': None}
+     >>> get_name_version('dev__horario__checker')
+     {'name': 'dev__horario__checker', 'version': None}
+     >>> get_name_version('tg__dActividades__v0_pipe_3907')
+     {'name': 'tg__dActividades', 'version': 0}
+     >>> get_name_version('tg__origin_workspace.shared_ds__v3907')
+     {'name': 'tg__origin_workspace.shared_ds', 'version': 3907}
+     >>> get_name_version('tmph8egtl__')
+     {'name': 'tmph8egtl__', 'version': None}
+     >>> get_name_version('tmph8egtl__123__')
+     {'name': 'tmph8egtl__123__', 'version': None}
+     """
+     tk = ds.rsplit("__", 2)
+     if len(tk) == 1:
+         return {"name": tk[0], "version": None}
+     elif len(tk) == 2:
+         if len(tk[1]):
+             if tk[1][0] == "v" and re.match("[0-9]+$", tk[1][1:]):
+                 return {"name": tk[0], "version": int(tk[1][1:])}
+             else:
+                 return {"name": tk[0] + "__" + tk[1], "version": None}
+     elif len(tk) == 3 and len(tk[2]):
+         if tk[2] == "checker":
+             return {"name": tk[0] + "__" + tk[1] + "__" + tk[2], "version": None}
+         if tk[2][0] == "v":
+             parts = tk[2].split("_")
+             try:
+                 return {"name": tk[0] + "__" + tk[1], "version": int(parts[0][1:])}
+             except ValueError:
+                 return {"name": tk[0] + "__" + tk[1] + "__" + tk[2], "version": None}
+         else:
+             return {"name": "__".join(tk[0:]), "version": None}
+
+     return {"name": ds, "version": None}
+
+
+ def get_resource_versions(datasources: List[str]):
+     """
+     return the latest version for all the datasources
+     """
+     versions = {}
+     for x in datasources:
+         t = get_name_version(x)
+         name = t["name"]
+         if t.get("version", None) is not None:
+             versions[name] = t["version"]
+     return versions
+
+
+ def is_file_a_datasource(filename: str) -> bool:
+     extensions = Path(filename).suffixes
+     if ".datasource" in extensions:  # Accepts '.datasource' and '.datasource.incl'
+         return True
+
+     if ".incl" in extensions:
+         lines = []
+         with open(filename) as file:
+             lines = file.readlines()
+
+         for line in lines:
+             trimmed_line = line.strip().lower()
+             if trimmed_line.startswith("schema") or trimmed_line.startswith("engine"):
+                 return True
+
+     return False
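
For reference, a minimal usage sketch of the parser added above, based on the doctests in this file (the import path follows the new package layout; the sample datafile contents are illustrative, not taken from this diff):

    from tinybird.tb.modules.datafile.common import get_name_version, parse, parse_tags

    doc = parse('MAINTAINER "rambo"\nNODE n0\nSQL >\n    SELECT 1\n')
    print(doc.maintainer)                     # rambo
    print(doc.nodes[0]["sql"])                # SELECT 1
    print(parse_tags("env=prod,billing"))     # ('env=prod', ['billing'])
    print(get_name_version("dev__name__v0"))  # {'name': 'dev__name', 'version': 0}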