ingestr 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff shows the changes between the two package versions as published to their public registry. It is provided for informational purposes only.


ingestr/main.py CHANGED
@@ -1,19 +1,14 @@
-import hashlib
-import tempfile
 from datetime import datetime
 from enum import Enum
 from typing import Optional
 
-import dlt
-import humanize
 import typer
-from dlt.common.pipeline import LoadInfo
-from dlt.common.runtime.collector import Collector, LogCollector
+from dlt.common.runtime.collector import Collector
 from rich.console import Console
 from rich.status import Status
 from typing_extensions import Annotated
 
-from ingestr.src.factory import SourceDestinationFactory
+from ingestr.src.filters import cast_set_to_list
 from ingestr.src.telemetry.event import track
 
 app = typer.Typer(
@@ -118,6 +113,12 @@ class SchemaNaming(str, Enum):
     direct = "direct"
 
 
+class SqlReflectionLevel(str, Enum):
+    minimal = "minimal"
+    full = "full"
+    full_with_precision = "full_with_precision"
+
+
 @app.command()
 def ingest(
     source_uri: Annotated[
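The new `SqlReflectionLevel` enum appears to mirror the reflection levels of dlt's `sql_database` source: `minimal` reflects only column names and nullability, `full` adds data types, and `full_with_precision` also keeps precision/scale. A small sketch of how the CLI string resolves into the enum, assuming it stays importable from `ingestr.main` as defined above:

```python
# Sketch: the value given via --sql-reflection-level or SQL_REFLECTION_LEVEL
# is parsed into the str-based enum and passed downstream as its .value.
from ingestr.main import SqlReflectionLevel  # assumed module-level definition, per the hunk

level = SqlReflectionLevel("full_with_precision")
assert level is SqlReflectionLevel.full_with_precision
assert level.value == "full_with_precision"
```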
@@ -181,6 +182,20 @@ def ingest(
             envvar="PRIMARY_KEY",
         ),
     ] = None,  # type: ignore
+    partition_by: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The partition key to be used for partitioning the destination table",
+            envvar="PARTITION_BY",
+        ),
+    ] = None,  # type: ignore
+    cluster_by: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
+            envvar="CLUSTER_BY",
+        ),
+    ] = None,  # type: ignore
     yes: Annotated[
         Optional[bool],
         typer.Option(
@@ -251,7 +266,88 @@ def ingest(
             envvar="EXTRACT_PARALLELISM",
         ),
     ] = 5,  # type: ignore
+    sql_reflection_level: Annotated[
+        SqlReflectionLevel,
+        typer.Option(
+            help="The reflection level to use when reflecting the table schema from the source",
+            envvar="SQL_REFLECTION_LEVEL",
+        ),
+    ] = SqlReflectionLevel.full,  # type: ignore
+    sql_limit: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The limit to use when fetching data from the source",
+            envvar="SQL_LIMIT",
+        ),
+    ] = None,  # type: ignore
+    sql_exclude_columns: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="The columns to exclude from the source table",
+            envvar="SQL_EXCLUDE_COLUMNS",
+        ),
+    ] = [],  # type: ignore
 ):
+    import hashlib
+    import tempfile
+    from datetime import datetime
+
+    import dlt
+    import humanize
+    import typer
+    from dlt.common.destination import Destination
+    from dlt.common.pipeline import LoadInfo
+    from dlt.common.runtime.collector import Collector, LogCollector
+    from dlt.common.schema.typing import TColumnSchema
+
+    from ingestr.src.factory import SourceDestinationFactory
+    from ingestr.src.telemetry.event import track
+
+    def report_errors(run_info: LoadInfo):
+        for load_package in run_info.load_packages:
+            failed_jobs = load_package.jobs["failed_jobs"]
+            if len(failed_jobs) == 0:
+                continue
+
+            print()
+            print("[bold red]Failed jobs:[/bold red]")
+            print()
+            for job in failed_jobs:
+                print(f"[bold red] {job.job_file_info.job_id()}[/bold red]")
+                print(f" [bold yellow]Error:[/bold yellow] {job.failed_message}")
+
+            raise typer.Exit(1)
+
+    def validate_source_dest_tables(
+        source_table: str, dest_table: str
+    ) -> tuple[str, str]:
+        if not dest_table:
+            if len(source_table.split(".")) != 2:
+                print(
+                    "[red]Table name must be in the format schema.table for source table when dest-table is not given.[/red]"
+                )
+                raise typer.Abort()
+
+            print()
+            print(
+                "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
+            )
+            dest_table = source_table
+        return (source_table, dest_table)
+
+    def validate_loader_file_format(
+        dlt_dest: Destination, loader_file_format: Optional[LoaderFileFormat]
+    ):
+        if (
+            loader_file_format
+            and loader_file_format.value
+            not in dlt_dest.capabilities().supported_loader_file_formats
+        ):
+            print(
+                f"[red]Loader file format {loader_file_format.value} is not supported by the destination.[/red]"
+            )
+            raise typer.Abort()
+
     track(
         "command_triggered",
         {
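The heavy imports (`dlt`, `humanize`, the source/destination factory) now happen inside `ingest()` rather than at module import time, and two validators are pulled out of the main flow. The loader-file-format check relies on the destination's declared capabilities; a minimal sketch of that check outside ingestr, assuming a local DuckDB destination (the format name is only illustrative):

```python
# Sketch: what validate_loader_file_format verifies, shown against a standalone
# dlt destination. "parquet" is an example format, not an ingestr default.
import dlt

dest = dlt.destinations.duckdb("example.duckdb")
supported = dest.capabilities().supported_loader_file_formats
if "parquet" not in supported:
    raise SystemExit(f"parquet not supported here; destination accepts: {supported}")
```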
@@ -259,6 +355,13 @@ def ingest(
         },
     )
 
+    clean_sql_exclude_columns = []
+    if sql_exclude_columns:
+        for col in sql_exclude_columns:
+            for possible_col in col.split(","):
+                clean_sql_exclude_columns.append(possible_col.strip())
+    sql_exclude_columns = clean_sql_exclude_columns
+
     dlt.config["data_writer.buffer_max_items"] = page_size
     dlt.config["data_writer.file_max_items"] = loader_file_size
     dlt.config["extract.workers"] = extract_parallelism
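`--sql-exclude-columns` can be given multiple times and/or as a comma-separated list; the block above flattens both forms into a single list of trimmed column names. For example (column names are made up):

```python
# Sketch of the normalization above: repeated flags and comma-separated values
# collapse into one flat list.
sql_exclude_columns = ["password_hash, ssn", "internal_notes"]

clean = []
for col in sql_exclude_columns:
    for possible_col in col.split(","):
        clean.append(possible_col.strip())

print(clean)  # ['password_hash', 'ssn', 'internal_notes']
```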
@@ -267,29 +370,23 @@ def ingest(
     dlt.config["schema.naming"] = schema_naming.value
 
     try:
-        if not dest_table:
-            if len(source_table.split(".")) != 2:
-                print(
-                    "[red]Table name must be in the format schema.table for source table when dest-table is not given.[/red]"
-                )
-                raise typer.Abort()
-
-            print()
-            print(
-                "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
-            )
-            dest_table = source_table
+        (source_table, dest_table) = validate_source_dest_tables(
+            source_table, dest_table
+        )
 
         factory = SourceDestinationFactory(source_uri, dest_uri)
         source = factory.get_source()
         destination = factory.get_destination()
 
+        column_hints: dict[str, TColumnSchema] = {}
         original_incremental_strategy = incremental_strategy
 
         merge_key = None
         if incremental_strategy == IncrementalStrategy.delete_insert:
             merge_key = incremental_key
             incremental_strategy = IncrementalStrategy.merge
+            if incremental_key:
+                column_hints[incremental_key] = {"merge_key": True}
 
         m = hashlib.sha256()
         m.update(dest_table.encode("utf-8"))
@@ -303,11 +400,31 @@ def ingest(
             pipelines_dir = tempfile.mkdtemp()
             is_pipelines_dir_temp = True
 
+        dlt_dest = destination.dlt_dest(uri=dest_uri)
+        validate_loader_file_format(dlt_dest, loader_file_format)
+
+        if partition_by:
+            if partition_by not in column_hints:
+                column_hints[partition_by] = {}
+
+            column_hints[partition_by]["partition"] = True
+
+        if cluster_by:
+            if cluster_by not in column_hints:
+                column_hints[cluster_by] = {}
+
+            column_hints[cluster_by]["cluster"] = True
+
+        if primary_key:
+            for key in primary_key:
+                if key not in column_hints:
+                    column_hints[key] = {}
+
+                column_hints[key]["primary_key"] = True
+
         pipeline = dlt.pipeline(
             pipeline_name=m.hexdigest(),
-            destination=destination.dlt_dest(
-                uri=dest_uri,
-            ),
+            destination=dlt_dest,
             progress=progressInstance,
             pipelines_dir=pipelines_dir,
             refresh="drop_resources" if full_refresh else None,
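The new `--partition-by` and `--cluster-by` options, together with `--primary-key`, are folded into a single `column_hints` mapping that is later handed to `pipeline.run(columns=...)` (see the last hunk of this file). A sketch of what the mapping looks like for hypothetical columns:

```python
# Sketch: column_hints produced for
#   --partition-by updated_at --cluster-by country --primary-key id
# (the column names are hypothetical).
from dlt.common.schema.typing import TColumnSchema

column_hints: dict[str, TColumnSchema] = {
    "updated_at": {"partition": True},
    "country": {"cluster": True},
    "id": {"primary_key": True},
}
```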
@@ -365,8 +482,18 @@ def ingest(
             interval_end=interval_end,
             sql_backend=sql_backend.value,
             page_size=page_size,
+            sql_reflection_level=sql_reflection_level.value,
+            sql_limit=sql_limit,
+            sql_exclude_columns=sql_exclude_columns,
         )
 
+        if hasattr(dlt_source, "selected_resources") and dlt_source.selected_resources:
+            resource_names = list(dlt_source.selected_resources.keys())
+            for res in resource_names:
+                dlt_source.resources[res].add_map(cast_set_to_list)
+        else:
+            dlt_source.add_map(cast_set_to_list)
+
         if original_incremental_strategy == IncrementalStrategy.delete_insert:
             dlt_source.incremental.primary_key = ()
 
@@ -397,32 +524,21 @@ def ingest(
             ),
             write_disposition=write_disposition,  # type: ignore
             primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
-            loader_file_format=loader_file_format.value
-            if loader_file_format is not None
-            else None,  # type: ignore
+            loader_file_format=(
+                loader_file_format.value if loader_file_format is not None else None  # type: ignore
+            ),  # type: ignore
+            columns=column_hints,
         )
 
-        for load_package in run_info.load_packages:
-            failed_jobs = load_package.jobs["failed_jobs"]
-            if len(failed_jobs) > 0:
-                print()
-                print("[bold red]Failed jobs:[/bold red]")
-                print()
-                for job in failed_jobs:
-                    print(f"[bold red] {job.job_file_info.job_id()}[/bold red]")
-                    print(f" [bold yellow]Error:[/bold yellow] {job.failed_message}")
-
-                raise typer.Exit(1)
+        report_errors(run_info)
 
         destination.post_load()
 
         end_time = datetime.now()
         elapsedHuman = ""
-        if run_info.started_at:
-            elapsed = end_time - start_time
-            elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
+        elapsed = end_time - start_time
+        elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
 
-        # remove the pipelines_dir folder if it was created by ingestr
         if is_pipelines_dir_temp:
             import shutil
 
@@ -82,7 +82,7 @@ def adjust_source(
         type_hints[metric] = KNOWN_TYPE_HINTS[metric]
 
     @dlt.resource(
-        write_disposition={"disposition": "merge", "strategy": "delete+insert"},
+        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
         merge_key=merge_key,
         primary_key=dimensions,
         columns=type_hints,
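dlt spells this merge strategy with a hyphen, so `delete+insert` is corrected to `delete-insert`. A minimal standalone resource using the corrected spelling (the resource name and rows are made up for illustration):

```python
# Sketch: a dlt resource with the corrected merge strategy name.
import dlt

@dlt.resource(
    write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    primary_key="id",
)
def example_rows():
    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]
```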
ingestr/src/filters.py ADDED
@@ -0,0 +1,21 @@
+from dlt.common.libs.sql_alchemy import Table
+
+
+def cast_set_to_list(row):
+    # this handles just the sqlalchemy backend for now
+    if isinstance(row, dict):
+        for key in row.keys():
+            if isinstance(row[key], set):
+                row[key] = list(row[key])
+    return row
+
+
+def table_adapter_exclude_columns(cols: list[str]):
+    print("given cols", cols)
+
+    def excluder(table: Table):
+        cols_to_remove = [col for col in table._columns if col.name in cols]  # type: ignore
+        for col in cols_to_remove:
+            table._columns.remove(col)  # type: ignore
+
+    return excluder
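`cast_set_to_list` is attached to every resource via `add_map` (see main.py above) because Python sets, which SQLAlchemy can yield for columns such as MySQL `SET`, are not JSON serializable. `table_adapter_exclude_columns` returns a callback that strips columns from a reflected SQLAlchemy `Table` before extraction. A short sketch of both, with made-up table and column names:

```python
# Sketch: exercising the two helpers directly. Table/column names are illustrative.
import json

import sqlalchemy as sa

from ingestr.src.filters import cast_set_to_list, table_adapter_exclude_columns

# Sets become lists so the row can be serialized for loading.
row = {"id": 1, "tags": {"red", "blue"}}
print(json.dumps(cast_set_to_list(row)))  # e.g. {"id": 1, "tags": ["blue", "red"]}

# Columns named in the exclude list are dropped from the reflected table.
meta = sa.MetaData()
users = sa.Table(
    "users",
    meta,
    sa.Column("id", sa.Integer),
    sa.Column("email", sa.String),
    sa.Column("password_hash", sa.String),
)
table_adapter_exclude_columns(["password_hash"])(users)
print([c.name for c in users.columns])  # expected: ['id', 'email']
```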
@@ -99,12 +99,12 @@ def gorgias_source(
                 "description": "When the user was last updated.",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Meta information associated with the user.",
             },
             "data": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Additional data associated with the user.",
             },
@@ -185,17 +185,17 @@ def gorgias_source(
                 "description": "Indicates if the ticket was created by an agent",
             },
             "customer": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "The customer linked to the ticket.",
             },
             "assignee_user": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "User assigned to the ticket",
             },
             "assignee_team": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Team assigned to the ticket",
             },
@@ -210,17 +210,17 @@ def gorgias_source(
                 "description": "Excerpt of the ticket",
             },
             "integrations": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "Integration information related to the ticket",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Meta information related to the ticket",
             },
             "tags": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "Tags associated with the ticket",
             },
@@ -354,7 +354,7 @@ def gorgias_source(
                 "description": "How the message has been received, or sent from Gorgias.",
             },
             "sender": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "The person who sent the message. It can be a user or a customer.",
             },
@@ -364,7 +364,7 @@ def gorgias_source(
                 "description": "ID of the integration that either received or sent the message.",
             },
             "intents": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "",
             },
@@ -379,7 +379,7 @@ def gorgias_source(
                 "description": "Whether the message was sent by your company to a customer, or the opposite.",
             },
             "receiver": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "The primary receiver of the message. It can be a user or a customer. Optional when the source type is 'internal-note'.",
             },
@@ -414,27 +414,27 @@ def gorgias_source(
                 "description": "",
             },
             "headers": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Headers of the message",
             },
             "attachments": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "A list of files attached to the message.",
             },
             "actions": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "A list of actions performed on the message.",
             },
             "macros": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "A list of macros",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Message metadata",
             },
@@ -526,7 +526,7 @@ def gorgias_source(
                 "description": "ID of the customer linked to the survey.",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Meta information associated with the survey.",
             },
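The Gorgias column hints switch from the deprecated `complex` data type to `json`, the name dlt now uses for nested/JSON columns. In isolation, such a hint looks like this (the column name is illustrative):

```python
# Sketch: a dlt column hint using the renamed "json" data type.
columns = {
    "meta": {
        "data_type": "json",
        "nullable": True,
        "description": "Arbitrary nested metadata.",
    },
}
```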