tinybird 0.0.1.dev6__py3-none-any.whl → 0.0.1.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tinybird might be problematic.

Files changed (31)
  1. tinybird/tb/modules/branch.py +0 -21
  2. tinybird/tb/modules/build.py +7 -18
  3. tinybird/tb/modules/cli.py +11 -131
  4. tinybird/tb/modules/common.py +14 -2
  5. tinybird/tb/modules/create.py +10 -14
  6. tinybird/tb/modules/datafile/build.py +2136 -0
  7. tinybird/tb/modules/datafile/build_common.py +118 -0
  8. tinybird/tb/modules/datafile/build_datasource.py +413 -0
  9. tinybird/tb/modules/datafile/build_pipe.py +648 -0
  10. tinybird/tb/modules/datafile/common.py +898 -0
  11. tinybird/tb/modules/datafile/diff.py +197 -0
  12. tinybird/tb/modules/datafile/exceptions.py +23 -0
  13. tinybird/tb/modules/datafile/format_common.py +66 -0
  14. tinybird/tb/modules/datafile/format_datasource.py +160 -0
  15. tinybird/tb/modules/datafile/format_pipe.py +195 -0
  16. tinybird/tb/modules/datafile/parse_datasource.py +41 -0
  17. tinybird/tb/modules/datafile/parse_pipe.py +69 -0
  18. tinybird/tb/modules/datafile/pipe_checker.py +560 -0
  19. tinybird/tb/modules/datafile/pull.py +157 -0
  20. tinybird/tb/modules/datasource.py +1 -1
  21. tinybird/tb/modules/fmt.py +4 -1
  22. tinybird/tb/modules/local.py +3 -0
  23. tinybird/tb/modules/pipe.py +8 -2
  24. tinybird/tb/modules/prompts.py +1 -1
  25. tinybird/tb/modules/workspace.py +1 -1
  26. {tinybird-0.0.1.dev6.dist-info → tinybird-0.0.1.dev8.dist-info}/METADATA +1 -1
  27. {tinybird-0.0.1.dev6.dist-info → tinybird-0.0.1.dev8.dist-info}/RECORD +30 -17
  28. tinybird/tb/modules/datafile.py +0 -6122
  29. {tinybird-0.0.1.dev6.dist-info → tinybird-0.0.1.dev8.dist-info}/WHEEL +0 -0
  30. {tinybird-0.0.1.dev6.dist-info → tinybird-0.0.1.dev8.dist-info}/entry_points.txt +0 -0
  31. {tinybird-0.0.1.dev6.dist-info → tinybird-0.0.1.dev8.dist-info}/top_level.txt +0 -0
tinybird/tb/modules/datafile/common.py
@@ -0,0 +1,898 @@
1
+ import glob
2
+ import itertools
3
+ import os
4
+ import os.path
5
+ import pprint
6
+ import re
7
+ import shlex
8
+ import textwrap
9
+ import traceback
10
+ from collections import namedtuple
11
+ from io import StringIO
12
+ from pathlib import Path
13
+ from string import Template
14
+ from typing import Any, Callable, Dict, List, Optional, Tuple, cast
15
+
16
+ import click
17
+ from mypy_extensions import KwArg, VarArg
18
+
19
+ from tinybird.ch_utils.engine import ENABLED_ENGINES
20
+ from tinybird.feedback_manager import FeedbackManager
21
+ from tinybird.sql import parse_indexes_structure, parse_table_structure, schema_to_sql_columns
22
+ from tinybird.tb.modules.datafile.exceptions import IncludeFileNotFoundException, ParseException, ValidationException
23
+ from tinybird.tb.modules.exceptions import CLIPipeException
24
+
25
+
26
+ class PipeTypes:
27
+ MATERIALIZED = "materialized"
28
+ ENDPOINT = "endpoint"
29
+ COPY = "copy"
30
+ DATA_SINK = "sink"
31
+ STREAM = "stream"
32
+ DEFAULT = "default"
33
+
34
+
35
+ class PipeNodeTypes:
36
+ MATERIALIZED = "materialized"
37
+ ENDPOINT = "endpoint"
38
+ STANDARD = "standard"
39
+ DEFAULT = "default"
40
+ DATA_SINK = "sink"
41
+ COPY = "copy"
42
+ STREAM = "stream"
43
+
44
+
45
+ class DataFileExtensions:
46
+ PIPE = ".pipe"
47
+ DATASOURCE = ".datasource"
48
+ INCL = ".incl"
49
+
50
+
51
+ class CopyModes:
52
+ APPEND = "append"
53
+ REPLACE = "replace"
54
+
55
+ valid_modes = (APPEND, REPLACE)
56
+
57
+ @staticmethod
58
+ def is_valid(node_mode):
59
+ return node_mode.lower() in CopyModes.valid_modes
60
+
61
+
62
+ class CopyParameters:
63
+ TARGET_DATASOURCE = "target_datasource"
64
+ COPY_SCHEDULE = "copy_schedule"
65
+ COPY_MODE = "copy_mode"
66
+
67
+
68
+ DATAFILE_NEW_LINE = "\n"
69
+ DATAFILE_INDENT = " " * 4
70
+
71
+ ON_DEMAND = "@on-demand"
72
+ DEFAULT_CRON_PERIOD: int = 60
73
+
74
+ INTERNAL_TABLES: Tuple[str, ...] = (
75
+ "datasources_ops_log",
76
+ "pipe_stats",
77
+ "pipe_stats_rt",
78
+ "block_log",
79
+ "data_connectors_log",
80
+ "kafka_ops_log",
81
+ "datasources_storage",
82
+ "endpoint_errors",
83
+ "bi_stats_rt",
84
+ "bi_stats",
85
+ )
86
+
87
+ PREVIEW_CONNECTOR_SERVICES = ["s3", "s3_iamrole", "gcs"]
88
+ TB_LOCAL_WORKSPACE_NAME = "Tinybird_Local_Testing"
89
+
90
+ pp = pprint.PrettyPrinter()
91
+
92
+
93
+ class Datafile:
94
+ def __init__(self) -> None:
95
+ self.maintainer: Optional[str] = None
96
+ self.sources: List[str] = []
97
+ self.nodes: List[Dict[str, Any]] = []
98
+ self.tokens: List[Dict[str, Any]] = []
99
+ self.version: Optional[int] = None
100
+ self.description: Optional[str] = None
101
+ self.raw: Optional[List[str]] = None
102
+ self.includes: Dict[str, Any] = {}
103
+ self.shared_with: List[str] = []
104
+ self.warnings: List[str] = []
105
+ self.filtering_tags: Optional[List[str]] = None
106
+
107
+ def validate(self) -> None:
108
+ for x in self.nodes:
109
+ if not x["name"].strip():
110
+ raise ValidationException("invalid node name, can't be empty")
111
+ if "sql" not in x:
112
+ raise ValidationException("node %s must have a SQL query" % x["name"])
113
+ if self.version is not None and (not isinstance(self.version, int) or self.version < 0):
114
+ raise ValidationException("version must be a positive integer")
115
+
116
+ def is_equal(self, other):
117
+ if len(self.nodes) != len(other.nodes):
118
+ return False
119
+
120
+ return all(self.nodes[i] == other.nodes[i] for i, _ in enumerate(self.nodes))
121
+
122
+
123
+ def format_filename(filename: str, hide_folders: bool = False):
124
+ return os.path.basename(filename) if hide_folders else filename
125
+
126
+
127
+ def _unquote(x: str):
128
+ QUOTES = ('"', "'")
129
+ if x[0] in QUOTES and x[-1] in QUOTES:
130
+ x = x[1:-1]
131
+ return x
132
+
133
+
134
+ def eval_var(s: str, skip: bool = False) -> str:
135
+ if skip:
136
+ return s
137
+ # replace ENV variables
138
+ # it's probably a bad idea to allow to get any env var
139
+ return Template(s).safe_substitute(os.environ)
140
+
141
+
142
+ def parse_tags(tags: str) -> Tuple[str, List[str]]:
143
+ """
144
+ Parses a string of tags into:
145
+ - kv_tags: a string of key-value tags: the previous tags we have for operational purposes. It
146
+ has the format key=value&key2=value2 (with_staging=true&with_last_date=true)
147
+ - filtering_tags: a list of tags that are used for filtering.
148
+
149
+ Example: "with_staging=true&with_last_date=true,billing,stats" ->
150
+ kv_tags = {"with_staging": "true", "with_last_date": "true"}
151
+ filtering_tags = ["billing", "stats"]
152
+ """
153
+ kv_tags = []
154
+ filtering_tags = []
155
+
156
+ entries = tags.split(",")
157
+ for entry in entries:
158
+ trimmed_entry = entry.strip()
159
+ if "=" in trimmed_entry:
160
+ kv_tags.append(trimmed_entry)
161
+ else:
162
+ filtering_tags.append(trimmed_entry)
163
+
164
+ all_kv_tags = "&".join(kv_tags)
165
+
166
+ return all_kv_tags, filtering_tags
167
+
168
+
169
+ def parse(
170
+ s: str,
171
+ default_node: Optional[str] = None,
172
+ basepath: str = ".",
173
+ replace_includes: bool = True,
174
+ skip_eval: bool = False,
175
+ ) -> Datafile:
176
+ """
177
+ Parses `s` string into a document
178
+ >>> d = parse("FROM SCRATCH\\nSOURCE 'https://example.com'\\n#this is a comment\\nMAINTAINER 'rambo' #this is me\\nNODE \\"test_01\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_00\\n\\n\\nNODE \\"test_02\\"\\n DESCRIPTION this is a node that does whatever\\nSQL >\\n\\n SELECT * from test_01\\n WHERE a > 1\\n GROUP by a\\n")
179
+ >>> d.maintainer
180
+ 'rambo'
181
+ >>> d.sources
182
+ ['https://example.com']
183
+ >>> len(d.nodes)
184
+ 2
185
+ >>> d.nodes[0]
186
+ {'name': 'test_01', 'description': 'this is a node that does whatever', 'sql': 'SELECT * from test_00'}
187
+ >>> d.nodes[1]
188
+ {'name': 'test_02', 'description': 'this is a node that does whatever', 'sql': 'SELECT * from test_01\\nWHERE a > 1\\nGROUP by a'}
189
+ """
190
+ lines = list(StringIO(s, newline=None))
191
+
192
+ doc = Datafile()
193
+ doc.raw = list(StringIO(s, newline=None))
194
+
195
+ parser_state = namedtuple("parser_state", ["multiline", "current_node", "command", "multiline_string", "is_sql"])
196
+
197
+ parser_state.multiline = False
198
+ parser_state.current_node = False
199
+
200
+ def assign(attr):
201
+ def _fn(x, **kwargs):
202
+ setattr(doc, attr, _unquote(x))
203
+
204
+ return _fn
205
+
206
+ def schema(*args, **kwargs):
207
+ s = _unquote("".join(args))
208
+ try:
209
+ sh = parse_table_structure(s)
210
+ except Exception as e:
211
+ raise ParseException(FeedbackManager.error_parsing_schema(line=kwargs["lineno"], error=e))
212
+
213
+ parser_state.current_node["schema"] = ",".join(schema_to_sql_columns(sh))
214
+ parser_state.current_node["columns"] = sh
215
+
216
+ def indexes(*args, **kwargs):
217
+ s = _unquote("".join(args))
218
+ if not s:
219
+ return
220
+ try:
221
+ indexes = parse_indexes_structure(s.splitlines())
222
+ except Exception as e:
223
+ raise ParseException(FeedbackManager.error_parsing_indices(line=kwargs["lineno"], error=e))
224
+
225
+ parser_state.current_node["indexes"] = indexes
226
+
227
+ def assign_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
228
+ def _f(*args: str, **kwargs: Any):
229
+ s = _unquote((" ".join(args)).strip())
230
+ parser_state.current_node[v.lower()] = eval_var(s, skip=skip_eval)
231
+
232
+ return _f
233
+
234
+ def sources(x: str, **kwargs: Any) -> None:
235
+ doc.sources.append(_unquote(x))
236
+
237
+ def node(*args: str, **kwargs: Any) -> None:
238
+ node = {"name": eval_var(_unquote(args[0]))}
239
+ doc.nodes.append(node)
240
+ parser_state.current_node = node
241
+
242
+ def scope(*args: str, **kwargs: Any) -> None:
243
+ scope = {"name": eval_var(_unquote(args[0]))}
244
+ doc.nodes.append(scope)
245
+ parser_state.current_node = scope
246
+
247
+ def description(*args: str, **kwargs: Any) -> None:
248
+ description = (" ".join(args)).strip()
249
+
250
+ if parser_state.current_node:
251
+ parser_state.current_node["description"] = description
252
+ if parser_state.current_node.get("name", "") == "default":
253
+ doc.description = description
254
+ else:
255
+ doc.description = description
256
+
257
+ def sql(var_name: str, **kwargs: Any) -> Callable[[str, KwArg(Any)], None]:
258
+ def _f(sql: str, **kwargs: Any) -> None:
259
+ if not parser_state.current_node:
260
+ raise ParseException("SQL must be called after a NODE command")
261
+ parser_state.current_node[var_name] = (
262
+ textwrap.dedent(sql).rstrip() if "%" not in sql.strip()[0] else sql.strip()
263
+ )
264
+
265
+ # HACK this cast is needed because Mypy
266
+ return cast(Callable[[str, KwArg(Any)], None], _f)
267
+
268
+ def assign_node_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
269
+ def _f(*args: str, **kwargs: Any) -> None:
270
+ if not parser_state.current_node:
271
+ raise ParseException("%s must be called after a NODE command" % v)
272
+ return assign_var(v)(*args, **kwargs)
273
+
274
+ return _f
275
+
276
+ def add_token(*args: str, **kwargs: Any) -> None: # token_name, permissions):
277
+ if len(args) < 2:
278
+ raise ParseException('TOKEN gets two params, token name and permissions e.g TOKEN "read api token" READ')
279
+ doc.tokens.append({"token_name": _unquote(args[0]), "permissions": args[1]})
280
+
281
+ def test(*args: str, **kwargs: Any) -> None:
282
+ # TODO: Should be removed?
283
+ print("test", args, kwargs) # noqa: T201
284
+
285
+ def include(*args: str, **kwargs: Any) -> None:
286
+ f = _unquote(args[0])
287
+ f = eval_var(f)
288
+ attrs = dict(_unquote(x).split("=", 1) for x in args[1:])
289
+ nonlocal lines
290
+ lineno = kwargs["lineno"]
291
+ replace_includes = kwargs["replace_includes"]
292
+ n = lineno
293
+ args_with_attrs = " ".join(args)
294
+
295
+ try:
296
+ while True:
297
+ n += 1
298
+ if len(lines) <= n:
299
+ break
300
+ if "NODE" in lines[n]:
301
+ doc.includes[args_with_attrs] = lines[n]
302
+ break
303
+ if args_with_attrs not in doc.includes:
304
+ doc.includes[args_with_attrs] = ""
305
+ except Exception:
306
+ pass
307
+
308
+ # If this parse was triggered by format, we don't want to replace the file
309
+ if not replace_includes:
310
+ return
311
+
312
+ # be sure to replace the include line
313
+ p = Path(basepath)
314
+
315
+ try:
316
+ with open(p / f) as file:
317
+ try:
318
+ ll = list(StringIO(file.read(), newline=None))
319
+ node_line = [line for line in ll if "NODE" in line]
320
+ if node_line and doc.includes[args_with_attrs]:
321
+ doc.includes[node_line[0].split("NODE")[-1].split("\n")[0].strip()] = ""
322
+ except Exception:
323
+ pass
324
+ finally:
325
+ file.seek(0)
326
+ lines[lineno : lineno + 1] = [
327
+ "",
328
+ *list(StringIO(Template(file.read()).safe_substitute(attrs), newline=None)),
329
+ ]
330
+ except FileNotFoundError:
331
+ raise IncludeFileNotFoundException(f, lineno)
332
+
333
+ def version(*args: str, **kwargs: Any) -> None:
334
+ if len(args) < 1:
335
+ raise ParseException("VERSION gets one positive integer param")
336
+ try:
337
+ version = int(args[0])
338
+ if version < 0:
339
+ raise ValidationException("version must be a positive integer e.g VERSION 2")
340
+ doc.version = version
341
+ except ValueError:
342
+ raise ValidationException("version must be a positive integer e.g VERSION 2")
343
+
344
+ def shared_with(*args: str, **kwargs: Any) -> None:
345
+ for entries in args:
346
+ # In case they specify multiple workspaces
347
+ doc.shared_with += [workspace.strip() for workspace in entries.splitlines()]
348
+
349
+ def __init_engine(v: str):
350
+ if not parser_state.current_node:
351
+ raise Exception(f"{v} must be called after a NODE command")
352
+ if "engine" not in parser_state.current_node:
353
+ parser_state.current_node["engine"] = {"type": None, "args": []}
354
+
355
+ def set_engine(*args: str, **kwargs: Any) -> None:
356
+ __init_engine("ENGINE")
357
+ engine_type = _unquote((" ".join(args)).strip())
358
+ parser_state.current_node["engine"]["type"] = eval_var(engine_type, skip=skip_eval)
359
+
360
+ def add_engine_var(v: str) -> Callable[[VarArg(str), KwArg(Any)], None]:
361
+ def _f(*args: str, **kwargs: Any):
362
+ __init_engine(f"ENGINE_{v}".upper())
363
+ engine_arg = eval_var(_unquote((" ".join(args)).strip()), skip=skip_eval)
364
+ parser_state.current_node["engine"]["args"].append((v, engine_arg))
365
+
366
+ return _f
367
+
368
+ def tags(*args: str, **kwargs: Any) -> None:
369
+ raw_tags = _unquote((" ".join(args)).strip())
370
+ operational_tags, filtering_tags = parse_tags(raw_tags)
371
+
372
+ # Pipe nodes or Data Sources
373
+ if parser_state.current_node and operational_tags:
374
+ operational_tags_args = (operational_tags,)
375
+ assign_node_var("tags")(*operational_tags_args, **kwargs)
376
+
377
+ if filtering_tags:
378
+ if doc.filtering_tags is None:
379
+ doc.filtering_tags = filtering_tags
380
+ else:
381
+ doc.filtering_tags += filtering_tags
382
+
383
+ cmds = {
384
+ "from": assign("from"),
385
+ "source": sources,
386
+ "maintainer": assign("maintainer"),
387
+ "schema": schema,
388
+ "indexes": indexes,
389
+ # TODO: Added to be able to merge MR 11347, let's remove it afterwards
390
+ "indices": indexes,
391
+ "engine": set_engine,
392
+ "partition_key": assign_var("partition_key"),
393
+ "sorting_key": assign_var("sorting_key"),
394
+ "primary_key": assign_var("primary_key"),
395
+ "sampling_key": assign_var("sampling_key"),
396
+ "ttl": assign_var("ttl"),
397
+ "settings": assign_var("settings"),
398
+ "node": node,
399
+ "scope": scope,
400
+ "description": description,
401
+ "type": assign_node_var("type"),
402
+ "datasource": assign_node_var("datasource"),
403
+ "tags": tags,
404
+ "target_datasource": assign_node_var("target_datasource"),
405
+ "copy_schedule": assign_node_var(CopyParameters.COPY_SCHEDULE),
406
+ "copy_mode": assign_node_var("mode"),
407
+ "mode": assign_node_var("mode"),
408
+ "resource": assign_node_var("resource"),
409
+ "filter": assign_node_var("filter"),
410
+ "token": add_token,
411
+ "test": test,
412
+ "include": include,
413
+ "sql": sql("sql"),
414
+ "version": version,
415
+ "kafka_connection_name": assign_var("kafka_connection_name"),
416
+ "kafka_topic": assign_var("kafka_topic"),
417
+ "kafka_group_id": assign_var("kafka_group_id"),
418
+ "kafka_bootstrap_servers": assign_var("kafka_bootstrap_servers"),
419
+ "kafka_key": assign_var("kafka_key"),
420
+ "kafka_secret": assign_var("kafka_secret"),
421
+ "kafka_schema_registry_url": assign_var("kafka_schema_registry_url"),
422
+ "kafka_target_partitions": assign_var("kafka_target_partitions"),
423
+ "kafka_auto_offset_reset": assign_var("kafka_auto_offset_reset"),
424
+ "kafka_store_raw_value": assign_var("kafka_store_raw_value"),
425
+ "kafka_store_headers": assign_var("kafka_store_headers"),
426
+ "kafka_store_binary_headers": assign_var("kafka_store_binary_headers"),
427
+ "kafka_key_avro_deserialization": assign_var("kafka_key_avro_deserialization"),
428
+ "kafka_ssl_ca_pem": assign_var("kafka_ssl_ca_pem"),
429
+ "kafka_sasl_mechanism": assign_var("kafka_sasl_mechanism"),
430
+ "import_service": assign_var("import_service"),
431
+ "import_connection_name": assign_var("import_connection_name"),
432
+ "import_schedule": assign_var("import_schedule"),
433
+ "import_strategy": assign_var("import_strategy"),
434
+ "import_external_datasource": assign_var("import_external_datasource"),
435
+ "import_bucket_uri": assign_var("import_bucket_uri"),
436
+ "import_from_timestamp": assign_var("import_from_timestamp"),
437
+ "import_query": assign_var("import_query"),
438
+ "import_table_arn": assign_var("import_table_arn"),
439
+ "import_export_bucket": assign_var("import_export_bucket"),
440
+ "shared_with": shared_with,
441
+ "export_service": assign_var("export_service"),
442
+ "export_connection_name": assign_var("export_connection_name"),
443
+ "export_schedule": assign_var("export_schedule"),
444
+ "export_bucket_uri": assign_var("export_bucket_uri"),
445
+ "export_file_template": assign_var("export_file_template"),
446
+ "export_format": assign_var("export_format"),
447
+ "export_strategy": assign_var("export_strategy"),
448
+ "export_compression": assign_var("export_compression"),
449
+ "export_kafka_topic": assign_var("export_kafka_topic"),
450
+ }
451
+
452
+ engine_vars = set()
453
+
454
+ for _engine, (params, options) in ENABLED_ENGINES:
455
+ for p in params:
456
+ engine_vars.add(p.name)
457
+ for o in options:
458
+ engine_vars.add(o.name)
459
+ for v in engine_vars:
460
+ cmds[f"engine_{v}"] = add_engine_var(v)
461
+
462
+ if default_node:
463
+ node(default_node)
464
+
465
+ lineno = 0
466
+ try:
467
+ while lineno < len(lines):
468
+ line = lines[lineno]
469
+ try:
470
+ sa = shlex.shlex(line)
471
+ sa.whitespace_split = True
472
+ lexer = list(sa)
473
+ except ValueError:
474
+ sa = shlex.shlex(shlex.quote(line))
475
+ sa.whitespace_split = True
476
+ lexer = list(sa)
477
+ if lexer:
478
+ cmd, args = lexer[0], lexer[1:]
479
+ if (
480
+ parser_state.multiline
481
+ and cmd.lower() in cmds
482
+ and not (line.startswith(" ") or line.startswith("\t") or line.lower().startswith("from"))
483
+ ):
484
+ parser_state.multiline = False
485
+ cmds[parser_state.command](
486
+ parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes
487
+ )
488
+
489
+ if not parser_state.multiline:
490
+ if len(args) >= 1 and args[0] == ">":
491
+ parser_state.multiline = True
492
+ parser_state.command = cmd.lower()
493
+ parser_state.multiline_string = ""
494
+ else:
495
+ if cmd.lower() == "settings":
496
+ raise click.ClickException(FeedbackManager.error_settings_not_allowed())
497
+ if cmd.lower() in cmds:
498
+ cmds[cmd.lower()](*args, lineno=lineno, replace_includes=replace_includes)
499
+ else:
500
+ raise click.ClickException(FeedbackManager.error_option(option=cmd.upper()))
501
+ else:
502
+ parser_state.multiline_string += line
503
+ lineno += 1
504
+ # close final state
505
+ if parser_state.multiline:
506
+ cmds[parser_state.command](parser_state.multiline_string, lineno=lineno, replace_includes=replace_includes)
507
+ except ParseException as e:
508
+ raise ParseException(str(e), lineno=lineno)
509
+ except ValidationException as e:
510
+ raise ValidationException(str(e), lineno=lineno)
511
+ except IndexError as e:
512
+ if "node" in line.lower():
513
+ raise click.ClickException(FeedbackManager.error_missing_node_name())
514
+ elif "sql" in line.lower():
515
+ raise click.ClickException(FeedbackManager.error_missing_sql_command())
516
+ elif "datasource" in line.lower():
517
+ raise click.ClickException(FeedbackManager.error_missing_datasource_name())
518
+ else:
519
+ raise ValidationException(f"Validation error, found {line} in line {str(lineno)}: {str(e)}", lineno=lineno)
520
+ except IncludeFileNotFoundException as e:
521
+ raise IncludeFileNotFoundException(str(e), lineno=lineno)
522
+ except Exception as e:
523
+ traceback.print_tb(e.__traceback__)
524
+ raise ParseException(f"Unexpected error: {e}", lineno=lineno)
525
+
526
+ return doc
527
+
528
+
529
+ class ImportReplacements:
530
+ _REPLACEMENTS: Tuple[Tuple[str, str, Optional[str]], ...] = (
531
+ ("import_service", "service", None),
532
+ ("import_strategy", "mode", "replace"),
533
+ ("import_connection_name", "connection", None),
534
+ ("import_schedule", "cron", ON_DEMAND),
535
+ ("import_query", "query", None),
536
+ ("import_connector", "connector", None),
537
+ ("import_external_datasource", "external_data_source", None),
538
+ ("import_bucket_uri", "bucket_uri", None),
539
+ ("import_from_timestamp", "from_time", None),
540
+ ("import_table_arn", "dynamodb_table_arn", None),
541
+ ("import_export_bucket", "dynamodb_export_bucket", None),
542
+ )
543
+
544
+ @staticmethod
545
+ def get_datafile_parameter_keys() -> List[str]:
546
+ return [x[0] for x in ImportReplacements._REPLACEMENTS]
547
+
548
+ @staticmethod
549
+ def get_api_param_for_datafile_param(connector_service: str, key: str) -> Tuple[Optional[str], Optional[str]]:
550
+ """Returns the API parameter name and default value for a given
551
+ datafile parameter.
552
+ """
553
+ key = key.lower()
554
+ for datafile_k, linker_k, value in ImportReplacements._REPLACEMENTS:
555
+ if datafile_k == key:
556
+ return linker_k, value
557
+ return None, None
558
+
559
+ @staticmethod
560
+ def get_datafile_param_for_linker_param(connector_service: str, linker_param: str) -> Optional[str]:
561
+ """Returns the datafile parameter name for a given linter parameter."""
562
+ linker_param = linker_param.lower()
563
+ for datafile_k, linker_k, _ in ImportReplacements._REPLACEMENTS:
564
+ if linker_k == linker_param:
565
+ return datafile_k
566
+ return None
567
+
568
+ @staticmethod
569
+ def get_datafile_value_for_linker_value(
570
+ connector_service: str, linker_param: str, linker_value: str
571
+ ) -> Optional[str]:
572
+ """Map linker values to datafile values."""
573
+ linker_param = linker_param.lower()
574
+ if linker_param != "cron":
575
+ return linker_value
576
+ if linker_value == "@once":
577
+ return ON_DEMAND
578
+ if connector_service in PREVIEW_CONNECTOR_SERVICES:
579
+ return "@auto"
580
+ return linker_value
581
+
582
+
583
+ class ExportReplacements:
584
+ SERVICES = ("gcs_hmac", "s3", "s3_iamrole", "kafka")
585
+ NODE_TYPES = (PipeNodeTypes.DATA_SINK, PipeNodeTypes.STREAM)
586
+ _REPLACEMENTS = (
587
+ ("export_service", "service", None),
588
+ ("export_connection_name", "connection", None),
589
+ ("export_schedule", "schedule_cron", ""),
590
+ ("export_bucket_uri", "path", None),
591
+ ("export_file_template", "file_template", None),
592
+ ("export_format", "format", "csv"),
593
+ ("export_compression", "compression", None),
594
+ ("export_strategy", "strategy", "@new"),
595
+ ("export_kafka_topic", "kafka_topic", None),
596
+ ("kafka_connection_name", "connection", None),
597
+ ("kafka_topic", "kafka_topic", None),
598
+ )
599
+
600
+ @staticmethod
601
+ def get_export_service(node: Dict[str, Optional[str]]) -> str:
602
+ if (node.get("type", "standard") or "standard").lower() == PipeNodeTypes.STREAM:
603
+ return "kafka"
604
+ return (node.get("export_service", "") or "").lower()
605
+
606
+ @staticmethod
607
+ def get_node_type(node: Dict[str, Optional[str]]) -> str:
608
+ return (node.get("type", "standard") or "standard").lower()
609
+
610
+ @staticmethod
611
+ def is_export_node(node: Dict[str, Optional[str]]) -> bool:
612
+ export_service = ExportReplacements.get_export_service(node)
613
+ node_type = (node.get("type", "standard") or "standard").lower()
614
+ if not export_service:
615
+ return False
616
+ if export_service not in ExportReplacements.SERVICES:
617
+ raise CLIPipeException(f"Invalid export service: {export_service}")
618
+ if node_type not in ExportReplacements.NODE_TYPES:
619
+ raise CLIPipeException(f"Invalid export node type: {node_type}")
620
+ return True
621
+
622
+ @staticmethod
623
+ def get_params_from_datafile(node: Dict[str, Optional[str]]) -> Dict[str, Optional[str]]:
624
+ """Returns the export parameters for a given node."""
625
+ params = {}
626
+ node_type = ExportReplacements.get_node_type(node)
627
+ for datafile_key, export_key, default_value in ExportReplacements._REPLACEMENTS:
628
+ if node_type != PipeNodeTypes.STREAM and datafile_key.startswith("kafka_"):
629
+ continue
630
+ if node_type == PipeNodeTypes.STREAM and datafile_key.startswith("export_"):
631
+ continue
632
+ if datafile_key == "export_schedule" and node.get(datafile_key, None) == ON_DEMAND:
633
+ node[datafile_key] = ""
634
+ params[export_key] = node.get(datafile_key, default_value)
635
+ return params
636
+
637
+ @staticmethod
638
+ def get_datafile_key(param: str, node: Dict[str, Optional[str]]) -> Optional[str]:
639
+ """Returns the datafile key for a given export parameter."""
640
+ node_type = ExportReplacements.get_node_type(node)
641
+ for datafile_key, export_key, _ in ExportReplacements._REPLACEMENTS:
642
+ if node_type != PipeNodeTypes.STREAM and datafile_key.startswith("kafka_"):
643
+ continue
644
+ if node_type == PipeNodeTypes.STREAM and datafile_key.startswith("export_"):
645
+ continue
646
+ if export_key == param.lower():
647
+ return datafile_key.upper()
648
+ return None
649
+
650
+
651
+ def get_project_filenames(folder: str, with_vendor=False) -> List[str]:
652
+ folders: List[str] = [
653
+ f"{folder}/*.datasource",
654
+ f"{folder}/datasources/*.datasource",
655
+ f"{folder}/*.pipe",
656
+ f"{folder}/pipes/*.pipe",
657
+ f"{folder}/endpoints/*.pipe",
658
+ f"{folder}/materializations/*.pipe",
659
+ f"{folder}/sinks/*.pipe",
660
+ f"{folder}/copies/*.pipe",
661
+ f"{folder}/playgrounds/*.pipe",
662
+ ]
663
+ if with_vendor:
664
+ folders.append(f"{folder}/vendor/**/**/*.datasource")
665
+ filenames: List[str] = []
666
+ for x in folders:
667
+ filenames += glob.glob(x)
668
+ return filenames
669
+
670
+
671
+ def has_internal_datafiles(folder: str) -> bool:
672
+ folder = folder or "."
673
+ filenames = get_project_filenames(folder)
674
+ return any("spans" in str(f) and "vendor" not in str(f) for f in filenames)
675
+
676
+
677
+ def peek(iterable):
678
+ try:
679
+ first = next(iterable)
680
+ except Exception:
681
+ return None, None
682
+ return first, itertools.chain([first], iterable)
683
+
684
+
685
+ def normalize_array(items: List[Dict[str, Optional[Any]]]) -> List[Dict]:
686
+ """
687
+ Sorted() doesn't support values with different types for the same column, like None vs str.
688
+ So we need to cast each None to the default value of the column's type, if one exists; if every value in a column is None, we leave them as None.
689
+ >>> normalize_array([{'x': 'hello World'}, {'x': None}])
690
+ [{'x': 'hello World'}, {'x': ''}]
691
+ >>> normalize_array([{'x': 3}, {'x': None}])
692
+ [{'x': 3}, {'x': 0}]
693
+ >>> normalize_array([{'x': {'y': [1,2,3,4]}}, {'x': {'z': "Hello" }}])
694
+ [{'x': {'y': [1, 2, 3, 4]}}, {'x': {'z': 'Hello'}}]
695
+ """
696
+ types: Dict[str, type] = {}
697
+ if len(items) == 0:
698
+ return items
699
+
700
+ columns = items[0].keys()
701
+ for column in columns:
702
+ for object in items:
703
+ if object[column] is not None:
704
+ types[column] = type(object[column])
705
+ break
706
+
707
+ for object in items:
708
+ for column in columns:
709
+ if object[column] is not None:
710
+ continue
711
+
712
+ # If None, we replace it for the default value
713
+ if types.get(column, None):
714
+ object[column] = types[column]()
715
+
716
+ return items
717
+
718
+
719
+ def find_file_by_name(
720
+ folder: str,
721
+ name: str,
722
+ verbose: bool = False,
723
+ is_raw: bool = False,
724
+ workspace_lib_paths: Optional[List[Tuple[str, str]]] = None,
725
+ resource: Optional[Dict] = None,
726
+ ):
727
+ f = Path(folder)
728
+ ds = name + ".datasource"
729
+ if os.path.isfile(os.path.join(folder, ds)):
730
+ return ds, None
731
+ if os.path.isfile(f / "datasources" / ds):
732
+ return ds, None
733
+
734
+ pipe = name + ".pipe"
735
+ if os.path.isfile(os.path.join(folder, pipe)):
736
+ return pipe, None
737
+
738
+ if os.path.isfile(f / "endpoints" / pipe):
739
+ return pipe, None
740
+
741
+ if os.path.isfile(f / "pipes" / pipe):
742
+ return pipe, None
743
+
744
+ token = name + ".token"
745
+ if os.path.isfile(f / "tokens" / token):
746
+ return token, None
747
+
748
+ # look for the file in subdirectories if it's not found in datasources folder
749
+ if workspace_lib_paths:
750
+ _resource = None
751
+ for wk_name, wk_path in workspace_lib_paths:
752
+ file = None
753
+ if name.startswith(f"{wk_name}."):
754
+ file, _resource = find_file_by_name(
755
+ wk_path, name.replace(f"{wk_name}.", ""), verbose, is_raw, resource=resource
756
+ )
757
+ if file:
758
+ return file, _resource
759
+
760
+ if not is_raw:
761
+ f, raw = find_file_by_name(
762
+ folder,
763
+ name,
764
+ verbose=verbose,
765
+ is_raw=True,
766
+ workspace_lib_paths=workspace_lib_paths,
767
+ resource=resource,
768
+ )
769
+ return f, raw
770
+
771
+ # materialized node with DATASOURCE definition
772
+ if resource and "nodes" in resource:
773
+ for node in resource["nodes"]:
774
+ params = node.get("params", {})
775
+ if (
776
+ params.get("type", None) == "materialized"
777
+ and params.get("engine", None)
778
+ and params.get("datasource", None)
779
+ ):
780
+ pipe = resource["resource_name"] + ".pipe"
781
+ pipe_file_exists = (
782
+ os.path.isfile(os.path.join(folder, pipe))
783
+ or os.path.isfile(f / "endpoints" / pipe)
784
+ or os.path.isfile(f / "pipes" / pipe)
785
+ )
786
+ is_target_datasource = params["datasource"] == name
787
+ if pipe_file_exists and is_target_datasource:
788
+ return pipe, {"resource_name": params.get("datasource")}
789
+
790
+ if verbose:
791
+ click.echo(FeedbackManager.warning_file_not_found_inside(name=name, folder=folder))
792
+
793
+ return None, None
794
+
795
+
796
+ def get_name_version(ds: str) -> Dict[str, Any]:
797
+ """
798
+ Given a name like "dev__name__v0", returns its base name and version as a dict, e.g. {'name': 'dev__name', 'version': 0}
799
+ >>> get_name_version('dev__name__v0')
800
+ {'name': 'dev__name', 'version': 0}
801
+ >>> get_name_version('name__v0')
802
+ {'name': 'name', 'version': 0}
803
+ >>> get_name_version('dev__name')
804
+ {'name': 'dev__name', 'version': None}
805
+ >>> get_name_version('name')
806
+ {'name': 'name', 'version': None}
807
+ >>> get_name_version('horario__3__pipe')
808
+ {'name': 'horario__3__pipe', 'version': None}
809
+ >>> get_name_version('horario__checker')
810
+ {'name': 'horario__checker', 'version': None}
811
+ >>> get_name_version('dev__horario__checker')
812
+ {'name': 'dev__horario__checker', 'version': None}
813
+ >>> get_name_version('tg__dActividades__v0_pipe_3907')
814
+ {'name': 'tg__dActividades', 'version': 0}
815
+ >>> get_name_version('tg__dActividades__va_pipe_3907')
816
+ {'name': 'tg__dActividades__va_pipe_3907', 'version': None}
817
+ >>> get_name_version('tg__origin_workspace.shared_ds__v3907')
818
+ {'name': 'tg__origin_workspace.shared_ds', 'version': 3907}
819
+ >>> get_name_version('tmph8egtl__')
820
+ {'name': 'tmph8egtl__', 'version': None}
821
+ >>> get_name_version('tmph8egtl__123__')
822
+ {'name': 'tmph8egtl__123__', 'version': None}
823
+ >>> get_name_version('dev__name__v0')
824
+ {'name': 'dev__name', 'version': 0}
825
+ >>> get_name_version('name__v0')
826
+ {'name': 'name', 'version': 0}
827
+ >>> get_name_version('dev__name')
828
+ {'name': 'dev__name', 'version': None}
829
+ >>> get_name_version('name')
830
+ {'name': 'name', 'version': None}
831
+ >>> get_name_version('horario__3__pipe')
832
+ {'name': 'horario__3__pipe', 'version': None}
833
+ >>> get_name_version('horario__checker')
834
+ {'name': 'horario__checker', 'version': None}
835
+ >>> get_name_version('dev__horario__checker')
836
+ {'name': 'dev__horario__checker', 'version': None}
837
+ >>> get_name_version('tg__dActividades__v0_pipe_3907')
838
+ {'name': 'tg__dActividades', 'version': 0}
839
+ >>> get_name_version('tg__origin_workspace.shared_ds__v3907')
840
+ {'name': 'tg__origin_workspace.shared_ds', 'version': 3907}
841
+ >>> get_name_version('tmph8egtl__')
842
+ {'name': 'tmph8egtl__', 'version': None}
843
+ >>> get_name_version('tmph8egtl__123__')
844
+ {'name': 'tmph8egtl__123__', 'version': None}
845
+ """
846
+ tk = ds.rsplit("__", 2)
847
+ if len(tk) == 1:
848
+ return {"name": tk[0], "version": None}
849
+ elif len(tk) == 2:
850
+ if len(tk[1]):
851
+ if tk[1][0] == "v" and re.match("[0-9]+$", tk[1][1:]):
852
+ return {"name": tk[0], "version": int(tk[1][1:])}
853
+ else:
854
+ return {"name": tk[0] + "__" + tk[1], "version": None}
855
+ elif len(tk) == 3 and len(tk[2]):
856
+ if tk[2] == "checker":
857
+ return {"name": tk[0] + "__" + tk[1] + "__" + tk[2], "version": None}
858
+ if tk[2][0] == "v":
859
+ parts = tk[2].split("_")
860
+ try:
861
+ return {"name": tk[0] + "__" + tk[1], "version": int(parts[0][1:])}
862
+ except ValueError:
863
+ return {"name": tk[0] + "__" + tk[1] + "__" + tk[2], "version": None}
864
+ else:
865
+ return {"name": "__".join(tk[0:]), "version": None}
866
+
867
+ return {"name": ds, "version": None}
868
+
869
+
870
+ def get_resource_versions(datasources: List[str]):
871
+ """
872
+ Return the version declared for each datasource, keyed by name.
873
+ """
874
+ versions = {}
875
+ for x in datasources:
876
+ t = get_name_version(x)
877
+ name = t["name"]
878
+ if t.get("version", None) is not None:
879
+ versions[name] = t["version"]
880
+ return versions
881
+
882
+
883
+ def is_file_a_datasource(filename: str) -> bool:
884
+ extensions = Path(filename).suffixes
885
+ if ".datasource" in extensions: # Accepts '.datasource' and '.datasource.incl'
886
+ return True
887
+
888
+ if ".incl" in extensions:
889
+ lines = []
890
+ with open(filename) as file:
891
+ lines = file.readlines()
892
+
893
+ for line in lines:
894
+ trimmed_line = line.strip().lower()
895
+ if trimmed_line.startswith("schema") or trimmed_line.startswith("engine"):
896
+ return True
897
+
898
+ return False
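
Usage sketch (not part of the diff): a minimal example of how the new tinybird.tb.modules.datafile.common module introduced in this release can be exercised. It assumes the 0.0.1.dev8 wheel is installed; the .pipe snippet and the names daily_hits and events are made up for illustration.

from tinybird.tb.modules.datafile.common import get_name_version, parse, parse_tags

# A made-up .pipe-style definition: one node with a description and a multiline SQL block.
PIPE_SOURCE = (
    "NODE daily_hits\n"
    "DESCRIPTION counts hits per day\n"
    "SQL >\n"
    "\n"
    "    SELECT toDate(timestamp) AS day, count() AS hits\n"
    "    FROM events\n"
    "    GROUP BY day\n"
)

doc = parse(PIPE_SOURCE)       # returns a Datafile instance
doc.validate()                 # raises ValidationException if a node has no SQL
print(doc.nodes[0]["name"])    # -> daily_hits
print(doc.nodes[0]["sql"])     # -> dedented SELECT ... GROUP BY day

# Operational key=value tags are split from plain filtering tags.
print(parse_tags("with_staging=true,billing,stats"))
# -> ('with_staging=true', ['billing', 'stats'])

# Version-suffix parsing used when resolving resource names.
print(get_name_version("events__v2"))
# -> {'name': 'events', 'version': 2}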