ingestr 0.13.78__py3-none-any.whl → 0.13.79__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/main.py CHANGED
@@ -1,3 +1,4 @@
1
+ import warnings
1
2
  from datetime import datetime
2
3
  from enum import Enum
3
4
  from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated
8
9
 
9
10
  from ingestr.src.telemetry.event import track
10
11
 
12
+ try:
13
+ from duckdb_engine import DuckDBEngineWarning
14
+
15
+ warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
16
+ except ImportError:
17
+ # duckdb-engine not installed
18
+ pass
19
+
11
20
  app = typer.Typer(
12
21
  name="ingestr",
13
22
  help="ingestr is the CLI tool to ingest data from one source to another",
@@ -506,7 +515,6 @@ def ingest(
506
515
 
507
516
  if factory.source_scheme == "sqlite":
508
517
  source_table = "main." + source_table.split(".")[-1]
509
-
510
518
 
511
519
  if (
512
520
  incremental_key
@@ -600,10 +608,9 @@ def ingest(
600
608
  if factory.source_scheme == "influxdb":
601
609
  if primary_key:
602
610
  write_disposition = "merge"
603
-
604
611
 
605
612
  start_time = datetime.now()
606
-
613
+
607
614
  run_info: LoadInfo = pipeline.run(
608
615
  dlt_source,
609
616
  **destination.dlt_run_params(
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
1
- version = "v0.13.78"
1
+ version = "v0.13.79"
ingestr/src/destinations.py CHANGED
@@ -147,6 +147,24 @@ class DuckDBDestination(GenericSqlDestination):
147
147
  return dlt.destinations.duckdb(uri, **kwargs)
148
148
 
149
149
 
150
+ class MotherduckDestination(GenericSqlDestination):
151
+ def dlt_dest(self, uri: str, **kwargs):
152
+ from urllib.parse import parse_qs, urlparse
153
+
154
+ parsed = urlparse(uri)
155
+ query = parse_qs(parsed.query)
156
+ token = query.get("token", [None])[0]
157
+ from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
158
+
159
+ creds = {
160
+ "password": token,
161
+ }
162
+ if parsed.path.lstrip("/"):
163
+ creds["database"] = parsed.path.lstrip("/")
164
+
165
+ return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
166
+
167
+
150
168
  def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
151
169
  # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
152
170
  tup = struct.unpack(
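
The new MotherduckDestination above pulls the token from the query string and the database name from the URI path. A minimal standalone sketch of that parsing, assuming a destination URI of the form `md:///<database>?token=<token>` (the example URI and the helper name are illustrative, not part of the package):

```python
from urllib.parse import parse_qs, urlparse


def motherduck_credentials(uri: str) -> dict:
    """Split a MotherDuck URI into the credential fields used above."""
    parsed = urlparse(uri)
    query = parse_qs(parsed.query)

    creds = {"password": query.get("token", [None])[0]}
    database = parsed.path.lstrip("/")
    if database:
        creds["database"] = database
    return creds


# {'password': 'abc123', 'database': 'analytics'}
print(motherduck_credentials("md:///analytics?token=abc123"))
```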
ingestr/src/facebook_ads/__init__.py CHANGED
@@ -26,7 +26,6 @@ from .settings import (
26
26
  DEFAULT_LEAD_FIELDS,
27
27
  INSIGHT_FIELDS_TYPES,
28
28
  INSIGHTS_BREAKDOWNS_OPTIONS,
29
- INSIGHTS_PRIMARY_KEY,
30
29
  INVALID_INSIGHTS_FIELDS,
31
30
  TInsightsBreakdownOptions,
32
31
  TInsightsLevels,
ingestr/src/factory.py CHANGED
@@ -12,6 +12,7 @@ from ingestr.src.destinations import (
12
12
  DatabricksDestination,
13
13
  DuckDBDestination,
14
14
  GCSDestination,
15
+ MotherduckDestination,
15
16
  MsSQLDestination,
16
17
  MySqlDestination,
17
18
  PostgresDestination,
@@ -85,6 +86,8 @@ SQL_SOURCE_SCHEMES = [
85
86
  "mysql",
86
87
  "mysql+pymysql",
87
88
  "mysql+mysqlconnector",
89
+ "md",
90
+ "motherduck",
88
91
  "postgres",
89
92
  "postgresql",
90
93
  "postgresql+psycopg2",
@@ -195,6 +198,8 @@ class SourceDestinationFactory:
195
198
  "cratedb": CrateDBDestination,
196
199
  "databricks": DatabricksDestination,
197
200
  "duckdb": DuckDBDestination,
201
+ "motherduck": MotherduckDestination,
202
+ "md": MotherduckDestination,
198
203
  "mssql": MsSQLDestination,
199
204
  "postgres": PostgresDestination,
200
205
  "postgresql": PostgresDestination,
ingestr/src/freshdesk/__init__.py CHANGED
@@ -4,6 +4,8 @@ etc. to the database"""
4
4
  from typing import Any, Dict, Generator, Iterable, List, Optional
5
5
 
6
6
  import dlt
7
+ import pendulum
8
+ from dlt.common.time import ensure_pendulum_datetime
7
9
  from dlt.sources import DltResource
8
10
 
9
11
  from .freshdesk_client import FreshdeskClient
@@ -12,10 +14,12 @@ from .settings import DEFAULT_ENDPOINTS
12
14
 
13
15
  @dlt.source()
14
16
  def freshdesk_source(
15
- endpoints: Optional[List[str]] = None,
17
+ domain: str,
18
+ api_secret_key: str,
19
+ start_date: pendulum.DateTime,
20
+ end_date: Optional[pendulum.DateTime] = None,
16
21
  per_page: int = 100,
17
- domain: str = dlt.secrets.value,
18
- api_secret_key: str = dlt.secrets.value,
22
+ endpoints: Optional[List[str]] = None,
19
23
  ) -> Iterable[DltResource]:
20
24
  """
21
25
  Retrieves data from specified Freshdesk API endpoints.
@@ -39,7 +43,11 @@ def freshdesk_source(
39
43
  def incremental_resource(
40
44
  endpoint: str,
41
45
  updated_at: Optional[Any] = dlt.sources.incremental(
42
- "updated_at", initial_value="2022-01-01T00:00:00Z"
46
+ "updated_at",
47
+ initial_value=start_date.isoformat(),
48
+ end_value=end_date.isoformat() if end_date else None,
49
+ range_start="closed",
50
+ range_end="closed",
43
51
  ),
44
52
  ) -> Generator[Dict[Any, Any], Any, None]:
45
53
  """
@@ -48,15 +56,22 @@ def freshdesk_source(
48
56
  to ensure incremental loading.
49
57
  """
50
58
 
51
- # Retrieve the last updated timestamp to fetch only new or updated records.
52
- if updated_at is not None:
53
- updated_at = updated_at.last_value
59
+ if updated_at.last_value is not None:
60
+ start_date = ensure_pendulum_datetime(updated_at.last_value)
61
+ else:
62
+ start_date = start_date
63
+
64
+ if updated_at.end_value is not None:
65
+ end_date = ensure_pendulum_datetime(updated_at.end_value)
66
+ else:
67
+ end_date = pendulum.now(tz="UTC")
54
68
 
55
69
  # Use the FreshdeskClient instance to fetch paginated responses
56
70
  yield from freshdesk.paginated_response(
57
71
  endpoint=endpoint,
58
72
  per_page=per_page,
59
- updated_at=updated_at,
73
+ start_date=start_date,
74
+ end_date=end_date,
60
75
  )
61
76
 
62
77
  # Set default endpoints if not provided
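
The reworked resource resolves its effective load window from the incremental state: it resumes from the stored cursor when one exists, otherwise falls back to the configured start_date, and caps at end_date or "now" in UTC. A minimal sketch of that resolution, assuming dlt and pendulum are installed (the helper name is illustrative):

```python
from typing import Optional, Tuple

import pendulum
from dlt.common.time import ensure_pendulum_datetime


def resolve_window(
    last_value: Optional[str],
    end_value: Optional[str],
    configured_start: pendulum.DateTime,
) -> Tuple[pendulum.DateTime, pendulum.DateTime]:
    """Pick the [start, end] window the same way incremental_resource does."""
    start = (
        ensure_pendulum_datetime(last_value)
        if last_value is not None
        else configured_start
    )
    end = (
        ensure_pendulum_datetime(end_value)
        if end_value is not None
        else pendulum.now(tz="UTC")
    )
    return start, end


# First run: no stored cursor and no end bound -> load from 2022-01-01 until now.
start, end = resolve_window(None, None, ensure_pendulum_datetime("2022-01-01T00:00:00Z"))
print(start, end)
```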
ingestr/src/freshdesk/freshdesk_client.py CHANGED
@@ -2,8 +2,9 @@
2
2
 
3
3
  import logging
4
4
  import time
5
- from typing import Any, Dict, Iterable, Optional
5
+ from typing import Any, Dict, Iterable
6
6
 
7
+ import pendulum
7
8
  from dlt.common.typing import TDataItem
8
9
  from dlt.sources.helpers import requests
9
10
 
@@ -67,7 +68,8 @@ class FreshdeskClient:
67
68
  self,
68
69
  endpoint: str,
69
70
  per_page: int,
70
- updated_at: Optional[str] = None,
71
+ start_date: pendulum.DateTime,
72
+ end_date: pendulum.DateTime,
71
73
  ) -> Iterable[TDataItem]:
72
74
  """
73
75
  Fetches a paginated response from a specified endpoint.
@@ -88,8 +90,8 @@ class FreshdeskClient:
88
90
  param_key = (
89
91
  "updated_since" if endpoint == "tickets" else "_updated_since"
90
92
  )
91
- if updated_at:
92
- params[param_key] = updated_at
93
+
94
+ params[param_key] = start_date.to_iso8601_string()
93
95
 
94
96
  # Handle requests with rate-limiting
95
97
  # A maximum of 300 pages (30000 tickets) will be returned.
@@ -98,5 +100,14 @@ class FreshdeskClient:
98
100
 
99
101
  if not data:
100
102
  break # Stop if no data or max page limit reached
101
- yield data
103
+
104
+ filtered_data = [
105
+ item
106
+ for item in data
107
+ if "updated_at" in item
108
+ and pendulum.parse(item["updated_at"]) <= end_date
109
+ ]
110
+ if not filtered_data:
111
+ break
112
+ yield filtered_data
102
113
  page += 1
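
Because the Freshdesk API only accepts an `updated_since` lower bound, the upper bound is enforced client-side: each page is filtered by `updated_at <= end_date`, and pagination stops once a page filters down to nothing. A small illustration of that filtering with made-up items (the sample data is not from the package):

```python
import pendulum

# Made-up page of API items standing in for one paginated Freshdesk response.
page = [
    {"id": 1, "updated_at": "2024-01-05T10:00:00Z"},
    {"id": 2, "updated_at": "2024-03-01T10:00:00Z"},
]
end_date = pendulum.datetime(2024, 2, 1, tz="UTC")

filtered = [
    item
    for item in page
    if "updated_at" in item and pendulum.parse(item["updated_at"]) <= end_date
]
print([item["id"] for item in filtered])  # [1]; an empty result ends pagination early
```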
ingestr/src/github/__init__.py CHANGED
@@ -91,7 +91,9 @@ def github_repo_events(
91
91
  """
92
92
 
93
93
  # use naming function in table name to generate separate tables for each event
94
- @dlt.resource(primary_key= "id", table_name=lambda i: i["type"], write_disposition="merge")
94
+ @dlt.resource(
95
+ primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
96
+ )
95
97
  def repo_events(
96
98
  last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
97
99
  "created_at",
@@ -105,7 +107,7 @@ def github_repo_events(
105
107
  repos_path = (
106
108
  f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
107
109
  )
108
-
110
+
109
111
  # Get the date range from the incremental state
110
112
  start_filter = pendulum.parse(
111
113
  last_created_at.last_value or last_created_at.initial_value
@@ -115,7 +117,7 @@ def github_repo_events(
115
117
  if last_created_at.end_value
116
118
  else pendulum.now()
117
119
  )
118
-
120
+
119
121
  for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
120
122
  # Filter events by date range
121
123
  filtered_events = []
ingestr/src/github/helpers.py CHANGED
@@ -61,6 +61,7 @@ def get_stargazers(
61
61
  page_items,
62
62
  )
63
63
 
64
+
64
65
  def get_reactions_data(
65
66
  node_type: str,
66
67
  owner: str,
ingestr/src/influxdb/__init__.py CHANGED
@@ -7,6 +7,7 @@ from dlt.sources import DltResource
7
7
 
8
8
  from .client import InfluxClient
9
9
 
10
+
10
11
  @dlt.source(max_table_nesting=0)
11
12
  def influxdb_source(
12
13
  measurement: str,
ingestr/src/mongodb/__init__.py CHANGED
@@ -106,6 +106,7 @@ def mongodb_collection(
106
106
  filter_: Optional[Dict[str, Any]] = None,
107
107
  projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
108
108
  pymongoarrow_schema: Optional[Any] = None,
109
+ custom_query: Optional[List[Dict[str, Any]]] = None,
109
110
  ) -> Any:
110
111
  """
111
112
  A DLT source which loads a collection from a mongo database using PyMongo.
@@ -132,6 +133,7 @@ def mongodb_collection(
132
133
  exclude (dict) - {"released": False, "runtime": False}
133
134
  Note: Can't mix include and exclude statements '{"title": True, "released": False}`
134
135
  pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
136
+ custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()
135
137
 
136
138
  Returns:
137
139
  Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -161,4 +163,5 @@ def mongodb_collection(
161
163
  filter_=filter_ or {},
162
164
  projection=projection,
163
165
  pymongoarrow_schema=pymongoarrow_schema,
166
+ custom_query=custom_query,
164
167
  )
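
The new `custom_query` parameter lets a collection be loaded through an aggregation pipeline rather than a plain find(). A hedged usage sketch; the connection string, database, collection, and pipeline below are hypothetical:

```python
from ingestr.src.mongodb import mongodb_collection

# Hypothetical pipeline: keep completed orders and project the fields to load.
pipeline = [
    {"$match": {"status": "completed"}},
    {"$project": {"_id": 1, "status": 1, "updated_at": 1, "total": 1}},
]

orders = mongodb_collection(
    connection_url="mongodb://localhost:27017",  # hypothetical connection string
    database="shop",
    collection="orders",
    parallel=False,
    custom_query=pipeline,
)
```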
ingestr/src/mongodb/helpers.py CHANGED
@@ -1,6 +1,5 @@
1
1
  """Mongo database source helpers"""
2
2
 
3
- import json
4
3
  from itertools import islice
5
4
  from typing import (
6
5
  TYPE_CHECKING,
@@ -209,7 +208,7 @@ class CollectionLoader:
209
208
  if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
210
209
  yield dlt.mark.with_hints(
211
210
  res,
212
- dlt.mark.make_hints(columns={"_id": {"data_type": "json"} }),
211
+ dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
213
212
  )
214
213
  else:
215
214
  yield res
@@ -472,6 +471,145 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
472
471
  yield convert_arrow_columns(table)
473
472
 
474
473
 
474
+ class CollectionAggregationLoader(CollectionLoader):
475
+ """
476
+ MongoDB collection loader that uses aggregation pipelines instead of find queries.
477
+ """
478
+
479
+ def __init__(
480
+ self,
481
+ client: TMongoClient,
482
+ collection: TCollection,
483
+ chunk_size: int,
484
+ incremental: Optional[dlt.sources.incremental[Any]] = None,
485
+ ) -> None:
486
+ super().__init__(client, collection, chunk_size, incremental)
487
+ self.custom_query: Optional[List[Dict[str, Any]]] = None
488
+
489
+ def set_custom_query(self, query: List[Dict[str, Any]]):
490
+ """Set the custom aggregation pipeline query"""
491
+ self.custom_query = query
492
+
493
+ def load_documents(
494
+ self,
495
+ filter_: Dict[str, Any],
496
+ limit: Optional[int] = None,
497
+ projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
498
+ ) -> Iterator[TDataItem]:
499
+ """Load documents using aggregation pipeline"""
500
+ if not self.custom_query:
501
+ # Fallback to parent method if no custom query
502
+ yield from super().load_documents(filter_, limit, projection)
503
+ return
504
+
505
+ # Build aggregation pipeline
506
+ pipeline = list(self.custom_query) # Copy the query
507
+
508
+ # For custom queries, we assume incremental filtering is already handled
509
+ # via interval placeholders (:interval_start, :interval_end) in the query itself.
510
+ # We don't add additional incremental filtering to avoid conflicts.
511
+
512
+ # Add additional filter if provided
513
+ if filter_:
514
+ filter_match = {"$match": filter_}
515
+ pipeline.insert(0, filter_match)
516
+
517
+ # Add limit if specified
518
+ if limit and limit > 0:
519
+ pipeline.append({"$limit": limit})
520
+
521
+ print("pipeline", pipeline)
522
+ # Execute aggregation
523
+ cursor = self.collection.aggregate(pipeline, allowDiskUse=True)
524
+
525
+ # Process results in chunks
526
+ while docs_slice := list(islice(cursor, self.chunk_size)):
527
+ res = map_nested_in_place(convert_mongo_objs, docs_slice)
528
+ print("res", res)
529
+ if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
530
+ yield dlt.mark.with_hints(
531
+ res,
532
+ dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
533
+ )
534
+ else:
535
+ yield res
536
+
537
+
538
+ class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
539
+ """
540
+ MongoDB collection parallel loader that uses aggregation pipelines.
541
+ Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
542
+ Falls back to sequential loading.
543
+ """
544
+
545
+ def load_documents(
546
+ self,
547
+ filter_: Dict[str, Any],
548
+ limit: Optional[int] = None,
549
+ projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
550
+ ) -> Iterator[TDataItem]:
551
+ """Load documents using aggregation pipeline (sequential only)"""
552
+ logger.warning(
553
+ "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
554
+ )
555
+ yield from super().load_documents(filter_, limit, projection)
556
+
557
+
558
+ class CollectionAggregationArrowLoader(CollectionAggregationLoader):
559
+ """
560
+ MongoDB collection aggregation loader that uses Apache Arrow for data processing.
561
+ """
562
+
563
+ def load_documents(
564
+ self,
565
+ filter_: Dict[str, Any],
566
+ limit: Optional[int] = None,
567
+ projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
568
+ pymongoarrow_schema: Any = None,
569
+ ) -> Iterator[Any]:
570
+ """Load documents using aggregation pipeline with Arrow format"""
571
+ logger.warning(
572
+ "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
573
+ )
574
+
575
+ # Load documents normally and convert to arrow format
576
+ for batch in super().load_documents(filter_, limit, projection):
577
+ if batch: # Only process non-empty batches
578
+ try:
579
+ from dlt.common.libs.pyarrow import pyarrow
580
+
581
+ # Convert dict batch to arrow table
582
+ table = pyarrow.Table.from_pylist(batch)
583
+ yield convert_arrow_columns(table)
584
+ except ImportError:
585
+ logger.warning(
586
+ "PyArrow not available, falling back to object format"
587
+ )
588
+ yield batch
589
+
590
+
591
+ class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
592
+ """
593
+ MongoDB collection parallel aggregation loader with Arrow support.
594
+ Falls back to sequential loading.
595
+ """
596
+
597
+ def load_documents(
598
+ self,
599
+ filter_: Dict[str, Any],
600
+ limit: Optional[int] = None,
601
+ projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
602
+ pymongoarrow_schema: Any = None,
603
+ ) -> Iterator[TDataItem]:
604
+ """Load documents using aggregation pipeline with Arrow format (sequential only)"""
605
+ logger.warning(
606
+ "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
607
+ )
608
+ yield from super().load_documents(
609
+ filter_, limit, projection, pymongoarrow_schema
610
+ )
611
+
612
+
475
613
  def collection_documents(
476
614
  client: TMongoClient,
477
615
  collection: TCollection,
@@ -483,6 +621,7 @@ def collection_documents(
483
621
  limit: Optional[int] = None,
484
622
  chunk_size: Optional[int] = 10000,
485
623
  data_item_format: Optional[TDataItemFormat] = "object",
624
+ custom_query: Optional[List[Dict[str, Any]]] = None,
486
625
  ) -> Iterator[TDataItem]:
487
626
  """
488
627
  A DLT source which loads data from a Mongo database using PyMongo.
@@ -507,6 +646,7 @@ def collection_documents(
507
646
  Supported formats:
508
647
  object - Python objects (dicts, lists).
509
648
  arrow - Apache Arrow tables.
649
+ custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()
510
650
 
511
651
  Returns:
512
652
  Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -529,21 +669,48 @@ def collection_documents(
529
669
  "create a projection to select fields, `projection` will be ignored."
530
670
  )
531
671
 
532
- if parallel:
533
- if data_item_format == "arrow":
534
- LoaderClass = CollectionArrowLoaderParallel
672
+ # If custom query is provided, use aggregation loaders
673
+ if custom_query:
674
+ if parallel:
675
+ if data_item_format == "arrow":
676
+ LoaderClass = CollectionAggregationArrowLoaderParallel
677
+ else:
678
+ LoaderClass = CollectionAggregationLoaderParallel # type: ignore
535
679
  else:
536
- LoaderClass = CollectionLoaderParallel # type: ignore
680
+ if data_item_format == "arrow":
681
+ LoaderClass = CollectionAggregationArrowLoader # type: ignore
682
+ else:
683
+ LoaderClass = CollectionAggregationLoader # type: ignore
537
684
  else:
538
- if data_item_format == "arrow":
539
- LoaderClass = CollectionArrowLoader # type: ignore
685
+ if parallel:
686
+ if data_item_format == "arrow":
687
+ LoaderClass = CollectionArrowLoaderParallel
688
+ else:
689
+ LoaderClass = CollectionLoaderParallel # type: ignore
540
690
  else:
541
- LoaderClass = CollectionLoader # type: ignore
542
-
691
+ if data_item_format == "arrow":
692
+ LoaderClass = CollectionArrowLoader # type: ignore
693
+ else:
694
+ LoaderClass = CollectionLoader # type: ignore
695
+
543
696
  loader = LoaderClass(
544
697
  client, collection, incremental=incremental, chunk_size=chunk_size
545
698
  )
546
- if isinstance(loader, (CollectionArrowLoader, CollectionArrowLoaderParallel)):
699
+
700
+ # Set custom query if provided
701
+ if custom_query and hasattr(loader, "set_custom_query"):
702
+ loader.set_custom_query(custom_query)
703
+
704
+ # Load documents based on loader type
705
+ if isinstance(
706
+ loader,
707
+ (
708
+ CollectionArrowLoader,
709
+ CollectionArrowLoaderParallel,
710
+ CollectionAggregationArrowLoader,
711
+ CollectionAggregationArrowLoaderParallel,
712
+ ),
713
+ ):
547
714
  yield from loader.load_documents(
548
715
  limit=limit,
549
716
  filter_=filter_,
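
The aggregation loaders above drain their cursor in fixed-size batches via `islice`, since pymongo aggregation cursors cannot be partitioned for parallel loading. A minimal, self-contained sketch of that batching (the helper name is illustrative):

```python
from itertools import islice


def iter_chunks(cursor, chunk_size):
    """Yield fixed-size batches from any iterator, e.g. an aggregation cursor."""
    while batch := list(islice(cursor, chunk_size)):
        yield batch


for batch in iter_chunks(iter(range(7)), chunk_size=3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]
```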
ingestr/src/sources.py CHANGED
@@ -73,6 +73,20 @@ class SqlSource:
73
73
 
74
74
  engine_adapter_callback = None
75
75
 
76
+ if uri.startswith("md://") or uri.startswith("motherduck://"):
77
+ parsed_uri = urlparse(uri)
78
+ query_params = parse_qs(parsed_uri.query)
79
+ # Convert md:// URI to duckdb:///md: format
80
+ if parsed_uri.path:
81
+ db_path = parsed_uri.path
82
+ else:
83
+ db_path = ""
84
+
85
+ token = query_params.get("token", [""])[0]
86
+ if not token:
87
+ raise ValueError("Token is required for MotherDuck connection")
88
+ uri = f"duckdb:///md:{db_path}?motherduck_token={token}"
89
+
76
90
  if uri.startswith("mysql://"):
77
91
  uri = uri.replace("mysql://", "mysql+pymysql://")
78
92
 
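
For MotherDuck sources, the SqlSource branch above rewrites the URI into the duckdb dialect that SQLAlchemy and duckdb-engine understand. A standalone sketch of that rewrite, assuming a URI of the form `md:///<database>?token=<token>` (the example values and helper name are illustrative):

```python
from urllib.parse import parse_qs, urlparse


def motherduck_to_duckdb_uri(uri: str) -> str:
    """Rewrite an md:// source URI into its duckdb:///md: equivalent."""
    parsed = urlparse(uri)
    token = parse_qs(parsed.query).get("token", [""])[0]
    if not token:
        raise ValueError("Token is required for MotherDuck connection")
    return f"duckdb:///md:{parsed.path}?motherduck_token={token}"


# duckdb:///md:/analytics?motherduck_token=abc123
print(motherduck_to_duckdb_uri("md:///analytics?token=abc123"))
```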
@@ -409,31 +423,181 @@ class MongoDbSource:
409
423
  return False
410
424
 
411
425
  def dlt_source(self, uri: str, table: str, **kwargs):
412
- table_fields = table_string_to_dataclass(table)
426
+ # Check if this is a custom query format (collection:query)
427
+ if ":" in table:
428
+ collection_name, query_json = table.split(":", 1)
413
429
 
414
- incremental = None
415
- if kwargs.get("incremental_key"):
416
- start_value = kwargs.get("interval_start")
417
- end_value = kwargs.get("interval_end")
430
+ # Parse and validate the query
431
+ try:
432
+ import json
418
433
 
419
- incremental = dlt_incremental(
420
- kwargs.get("incremental_key", ""),
421
- initial_value=start_value,
422
- end_value=end_value,
423
- range_end="closed",
424
- range_start="closed",
434
+ query = json.loads(query_json)
435
+ except json.JSONDecodeError as e:
436
+ raise ValueError(f"Invalid JSON query format: {e}")
437
+
438
+ # Validate that it's a list for aggregation pipeline
439
+ if not isinstance(query, list):
440
+ raise ValueError(
441
+ "Query must be a JSON array representing a MongoDB aggregation pipeline"
442
+ )
443
+
444
+ # Check for incremental load requirements
445
+ incremental = None
446
+ if kwargs.get("incremental_key"):
447
+ start_value = kwargs.get("interval_start")
448
+ end_value = kwargs.get("interval_end")
449
+
450
+ # Validate that incremental key is present in the pipeline
451
+ incremental_key = kwargs.get("incremental_key")
452
+ self._validate_incremental_query(query, str(incremental_key))
453
+
454
+ incremental = dlt_incremental(
455
+ str(incremental_key),
456
+ initial_value=start_value,
457
+ end_value=end_value,
458
+ )
459
+
460
+ # Substitute interval parameters in the query
461
+ query = self._substitute_interval_params(query, kwargs)
462
+
463
+ # Parse collection name to get database and collection
464
+ if "." in collection_name:
465
+ # Handle database.collection format
466
+ table_fields = table_string_to_dataclass(collection_name)
467
+ database = table_fields.dataset
468
+ collection = table_fields.table
469
+ else:
470
+ # Single collection name, use default database
471
+ database = None
472
+ collection = collection_name
473
+
474
+ table_instance = self.table_builder(
475
+ connection_url=uri,
476
+ database=database,
477
+ collection=collection,
478
+ parallel=False,
479
+ incremental=incremental,
480
+ custom_query=query,
481
+ )
482
+ table_instance.max_table_nesting = 1
483
+ return table_instance
484
+ else:
485
+ # Default behavior for simple collection names
486
+ table_fields = table_string_to_dataclass(table)
487
+
488
+ incremental = None
489
+ if kwargs.get("incremental_key"):
490
+ start_value = kwargs.get("interval_start")
491
+ end_value = kwargs.get("interval_end")
492
+
493
+ incremental = dlt_incremental(
494
+ kwargs.get("incremental_key", ""),
495
+ initial_value=start_value,
496
+ end_value=end_value,
497
+ )
498
+
499
+ table_instance = self.table_builder(
500
+ connection_url=uri,
501
+ database=table_fields.dataset,
502
+ collection=table_fields.table,
503
+ parallel=False,
504
+ incremental=incremental,
505
+ )
506
+ table_instance.max_table_nesting = 1
507
+
508
+ return table_instance
509
+
510
+ def _validate_incremental_query(self, query: list, incremental_key: str):
511
+ """Validate that incremental key is projected in the aggregation pipeline"""
512
+ # Check if there's a $project stage and if incremental_key is included
513
+ has_project = False
514
+ incremental_key_projected = False
515
+
516
+ for stage in query:
517
+ if "$project" in stage:
518
+ has_project = True
519
+ project_stage = stage["$project"]
520
+ if isinstance(project_stage, dict):
521
+ # Check if incremental_key is explicitly included
522
+ if incremental_key in project_stage:
523
+ if project_stage[incremental_key] not in [0, False]:
524
+ incremental_key_projected = True
525
+ # If there are only inclusions (1 or True values) and incremental_key is not included
526
+ elif any(v in [1, True] for v in project_stage.values()):
527
+ # This is an inclusion projection, incremental_key must be explicitly included
528
+ incremental_key_projected = False
529
+ # If there are only exclusions (0 or False values) and incremental_key is not excluded
530
+ elif all(
531
+ v in [0, False]
532
+ for v in project_stage.values()
533
+ if v in [0, False, 1, True]
534
+ ):
535
+ # This is an exclusion projection, incremental_key is included by default
536
+ if incremental_key not in project_stage:
537
+ incremental_key_projected = True
538
+ else:
539
+ incremental_key_projected = project_stage[
540
+ incremental_key
541
+ ] not in [0, False]
542
+ else:
543
+ # Mixed or unclear projection, assume incremental_key needs to be explicit
544
+ incremental_key_projected = False
545
+
546
+ # If there's a $project stage but incremental_key is not projected, raise error
547
+ if has_project and not incremental_key_projected:
548
+ raise ValueError(
549
+ f"Incremental key '{incremental_key}' must be included in the projected fields of the aggregation pipeline"
425
550
  )
426
551
 
427
- table_instance = self.table_builder(
428
- connection_url=uri,
429
- database=table_fields.dataset,
430
- collection=table_fields.table,
431
- parallel=False,
432
- incremental=incremental,
433
- )
434
- table_instance.max_table_nesting = 1
552
+ def _substitute_interval_params(self, query: list, kwargs: dict):
553
+ """Substitute :interval_start and :interval_end placeholders with actual datetime values"""
554
+ from dlt.common.time import ensure_pendulum_datetime
435
555
 
436
- return table_instance
556
+ # Get interval values and convert them to datetime objects
557
+ interval_start = kwargs.get("interval_start")
558
+ interval_end = kwargs.get("interval_end")
559
+
560
+ # Convert string dates to datetime objects if needed
561
+ if interval_start is not None:
562
+ if isinstance(interval_start, str):
563
+ pendulum_dt = ensure_pendulum_datetime(interval_start)
564
+ interval_start = (
565
+ pendulum_dt.to_datetime()
566
+ if hasattr(pendulum_dt, "to_datetime")
567
+ else pendulum_dt
568
+ )
569
+ elif hasattr(interval_start, "to_datetime"):
570
+ interval_start = interval_start.to_datetime()
571
+
572
+ if interval_end is not None:
573
+ if isinstance(interval_end, str):
574
+ pendulum_dt = ensure_pendulum_datetime(interval_end)
575
+ interval_end = (
576
+ pendulum_dt.to_datetime()
577
+ if hasattr(pendulum_dt, "to_datetime")
578
+ else pendulum_dt
579
+ )
580
+ elif hasattr(interval_end, "to_datetime"):
581
+ interval_end = interval_end.to_datetime()
582
+
583
+ # Deep copy the query and replace placeholders with actual datetime objects
584
+ def replace_placeholders(obj):
585
+ if isinstance(obj, dict):
586
+ result = {}
587
+ for key, value in obj.items():
588
+ if value == ":interval_start" and interval_start is not None:
589
+ result[key] = interval_start
590
+ elif value == ":interval_end" and interval_end is not None:
591
+ result[key] = interval_end
592
+ else:
593
+ result[key] = replace_placeholders(value)
594
+ return result
595
+ elif isinstance(obj, list):
596
+ return [replace_placeholders(item) for item in obj]
597
+ else:
598
+ return obj
599
+
600
+ return replace_placeholders(query)
437
601
 
438
602
 
439
603
  class LocalCsvSource:
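
With this change, a MongoDB source table of the form `<collection>:<JSON aggregation pipeline>` triggers the aggregation path, and the literal `:interval_start` / `:interval_end` strings inside the pipeline are swapped for the requested interval datetimes. A hypothetical example of building such a table value (the collection and field names are made up):

```python
import json

# Hypothetical pipeline: filter by the load interval and project the fields to
# ingest; the incremental key (updated_at) must stay in the $project stage.
pipeline = [
    {"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},
    {"$project": {"_id": 1, "updated_at": 1, "status": 1}},
]

# "<database>.<collection>:<pipeline>" -- everything after the first ':' is parsed as JSON.
source_table = "shop.orders:" + json.dumps(pipeline)
print(source_table)
```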
@@ -961,7 +1125,7 @@ class SlackSource:
961
1125
 
962
1126
  class HubspotSource:
963
1127
  def handles_incrementality(self) -> bool:
964
- return True
1128
+ return False
965
1129
 
966
1130
  # hubspot://?api_key=<api_key>
967
1131
  def dlt_source(self, uri: str, table: str, **kwargs):
@@ -2528,6 +2692,18 @@ class FreshdeskSource:
2528
2692
  if api_key is None:
2529
2693
  raise MissingValueError("api_key", "Freshdesk")
2530
2694
 
2695
+ start_date = kwargs.get("interval_start")
2696
+ if start_date is not None:
2697
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
2698
+ else:
2699
+ start_date = ensure_pendulum_datetime("2022-01-01T00:00:00Z")
2700
+
2701
+ end_date = kwargs.get("interval_end")
2702
+ if end_date is not None:
2703
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
2704
+ else:
2705
+ end_date = None
2706
+
2531
2707
  if table not in [
2532
2708
  "agents",
2533
2709
  "companies",
@@ -2541,7 +2717,10 @@ class FreshdeskSource:
2541
2717
  from ingestr.src.freshdesk import freshdesk_source
2542
2718
 
2543
2719
  return freshdesk_source(
2544
- api_secret_key=api_key[0], domain=domain
2720
+ api_secret_key=api_key[0],
2721
+ domain=domain,
2722
+ start_date=start_date,
2723
+ end_date=end_date,
2545
2724
  ).with_resources(table)
2546
2725
 
2547
2726
 
@@ -2684,7 +2863,7 @@ class ElasticsearchSource:
2684
2863
 
2685
2864
  class AttioSource:
2686
2865
  def handles_incrementality(self) -> bool:
2687
- return True
2866
+ return False
2688
2867
 
2689
2868
  def dlt_source(self, uri: str, table: str, **kwargs):
2690
2869
  parsed_uri = urlparse(uri)
@@ -3056,7 +3235,7 @@ class InfluxDBSource:
3056
3235
 
3057
3236
  secure = params.get("secure", ["true"])[0].lower() != "false"
3058
3237
  scheme = "https" if secure else "http"
3059
-
3238
+
3060
3239
  if port:
3061
3240
  host_url = f"{scheme}://{host}:{port}"
3062
3241
  else:
ingestr-0.13.79.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestr
3
- Version: 0.13.78
3
+ Version: 0.13.79
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -355,6 +355,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
355
355
  <td>✅</td>
356
356
  <td>❌</td>
357
357
  </tr>
358
+ <tr>
359
+ <td>MotherDuck</td>
360
+ <td>✅</td>
361
+ <td>✅</td>
362
+ </tr>
358
363
  <tr>
359
364
  <td>MySQL</td>
360
365
  <td>✅</td>
ingestr-0.13.79.dist-info/RECORD CHANGED
@@ -1,17 +1,17 @@
1
1
  ingestr/conftest.py,sha256=OE2yxeTCosS9CUFVuqNypm-2ftYvVBeeq7egm3878cI,1981
2
- ingestr/main.py,sha256=QsNVrz5_NgRUkvfExnd-2E02TGmWivPuop5hYinVAjM,26513
2
+ ingestr/main.py,sha256=qoWHNcHh0-xVnyQxbQ-SKuTxPb1RNV3ENkCpqO7CLrk,26694
3
3
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
4
4
  ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
5
- ingestr/src/buildinfo.py,sha256=cARFQnpIzB5xD3JEaIPIkee7dO80kbLs4M_XypNnwSI,21
6
- ingestr/src/destinations.py,sha256=ivTPio0zzqLRx22i597pxZHMNClz-XvYSyCaCPuGd8g,22248
5
+ ingestr/src/buildinfo.py,sha256=yE0cfxWae8TNJJLYcRmNexeK769vtdz_-vJGzcROgwE,21
6
+ ingestr/src/destinations.py,sha256=M2Yni6wiWcrvZ8EPJemidqxN156l0rehgCc7xuil7mo,22840
7
7
  ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
8
- ingestr/src/factory.py,sha256=q_rSi4gYMfxnGvzhytPRAgC08N40nqDISvXwl7i-E_M,6655
8
+ ingestr/src/factory.py,sha256=rF5Ry4o4t8KulSPBtrd7ZKCI_0TH1DAetG0zs9H7oik,6792
9
9
  ingestr/src/filters.py,sha256=LLecXe9QkLFkFLUZ92OXNdcANr1a8edDxrflc2ko_KA,1452
10
10
  ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
11
11
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
12
12
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
13
13
  ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
14
- ingestr/src/sources.py,sha256=1A1tZKA1NUQnHdgvGPKHuRG5o8lNuCe7bIxB0n73eJw,107635
14
+ ingestr/src/sources.py,sha256=qZz35cdO-nO9CZsdOJ8Ni56wclNfbGQuGj4nsoHpFxE,115678
15
15
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
16
16
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
17
17
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -41,7 +41,7 @@ ingestr/src/clickup/helpers.py,sha256=RzDKMUAHccuDhocIQ2ToBXfCERo8CBJqA3t-IPltBC
41
41
  ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
42
42
  ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
43
43
  ingestr/src/elasticsearch/__init__.py,sha256=m-q93HgUmTwGDUwHOjHawstWL06TC3WIX3H05szybrY,2556
44
- ingestr/src/facebook_ads/__init__.py,sha256=Rchn-nH5mAOWW7OeYMCy_VS8dAoqfYY4t0YzWDSeN5k,9751
44
+ ingestr/src/facebook_ads/__init__.py,sha256=_9929DYzcq5iLt-l3DmJ4VBZwmoEwgyPZbPstH0ySmI,9725
45
45
  ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
46
46
  ingestr/src/facebook_ads/helpers.py,sha256=NshS21can1xhRKQzg_o-c6qSxWoC3NnE3FwgJxUnygE,8239
47
47
  ingestr/src/facebook_ads/settings.py,sha256=Bsic8RcmH-NfEZ7r_NGospTCmwISK9XaMT5y2NZirtg,4938
@@ -51,11 +51,11 @@ ingestr/src/filesystem/helpers.py,sha256=bg0muSHZr3hMa8H4jN2-LGWzI-SUoKlQNiWJ74-
51
51
  ingestr/src/filesystem/readers.py,sha256=a0fKkaRpnAOGsXI3EBNYZa7x6tlmAOsgRzb883StY30,3987
52
52
  ingestr/src/frankfurter/__init__.py,sha256=oVi4BiOxPRyckEVrBNunyMAHulPyMgyGRwBbhn-Xz6M,4987
53
53
  ingestr/src/frankfurter/helpers.py,sha256=SyrkRTDqvKdQxRHTV5kcSeVG3FEnaK5zxHyNyqtumZ0,1445
54
- ingestr/src/freshdesk/__init__.py,sha256=uFQW_cJyymxtHQiYb_xjzZAklc487L0n9GkgHgC7yAI,2618
55
- ingestr/src/freshdesk/freshdesk_client.py,sha256=3z5Yc008ADzRcJWtNc00PwjkLzG-RMI8jVIOOyYA-Rw,4088
54
+ ingestr/src/freshdesk/__init__.py,sha256=ukyorgCNsW_snzsYBDsr3Q0WB8f-to9Fk0enqHHFQlk,3087
55
+ ingestr/src/freshdesk/freshdesk_client.py,sha256=1nFf0K4MQ0KZbWwk4xSbYHaykVqmPLfN39miOFDpWVc,4385
56
56
  ingestr/src/freshdesk/settings.py,sha256=0Wr_OMnUZcTlry7BmALssLxD2yh686JW4moLNv12Jnw,409
57
- ingestr/src/github/__init__.py,sha256=R71y33KqzxDTvCLSGj2H2EztfGqsWGR9ZgcaurC1-A4,7220
58
- ingestr/src/github/helpers.py,sha256=hge8orylwiScRcMftlv4oSZ6ORvVANwHCPAGkg95FtI,6758
57
+ ingestr/src/github/__init__.py,sha256=C7b5j6CrxmTItS4tyDa3OYzdAw5c__xboOtoEJYe3wQ,7217
58
+ ingestr/src/github/helpers.py,sha256=rpv_3HzuOl4PQ-FUeA66pev-pgze9SaE8RUHIPYfZ_A,6759
59
59
  ingestr/src/github/queries.py,sha256=W34C02jUEdjFmOE7f7u9xvYyBNDMfVZAu0JIRZI2mkU,2302
60
60
  ingestr/src/github/settings.py,sha256=N5ahWrDIQ_4IWV9i-hTXxyYduqY9Ym2BTwqsWxcDdJ8,258
61
61
  ingestr/src/google_ads/__init__.py,sha256=bH0TtnRWcOUESezpvoA7VEUHAq_0ITGQeX4GGVBfl1I,3725
@@ -75,7 +75,7 @@ ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOIN
75
75
  ingestr/src/hubspot/__init__.py,sha256=wqHefhc_YRI5dNFCcpvH-UUilNThE49sbGouSBiHYsw,11776
76
76
  ingestr/src/hubspot/helpers.py,sha256=k2b-lhxqBNKHoOSHoHegFSsk8xxjjGA0I04V0XyX2b4,7883
77
77
  ingestr/src/hubspot/settings.py,sha256=i73MkSiJfRLMFLfiJgYdhp-rhymHTfoqFzZ4uOJdFJM,2456
78
- ingestr/src/influxdb/__init__.py,sha256=sj_K4ShXECp6cW4xVVv2kCwQCFtTYD0dC9LOAEqFoVI,1289
78
+ ingestr/src/influxdb/__init__.py,sha256=cYsGnDPNHRTe9pp14ogDQgPTCI9TOdyJm1MaNuQLHdk,1290
79
79
  ingestr/src/influxdb/client.py,sha256=hCxSNREAWWEvvAV3RQbKaWp2-e_7EE8xmVRjTwLFEFo,1230
80
80
  ingestr/src/isoc_pulse/__init__.py,sha256=9b4eN4faatpiwTuRNPuYcEt1hEFDEjua9XhfakUigBk,4648
81
81
  ingestr/src/kafka/__init__.py,sha256=QUHsGmdv5_E-3z0GDHXvbk39puwuGDBsyYSDhvbA89E,3595
@@ -92,8 +92,8 @@ ingestr/src/linkedin_ads/dimension_time_enum.py,sha256=EmHRdkFyTAfo4chGjThrwqffW
92
92
  ingestr/src/linkedin_ads/helpers.py,sha256=eUWudRVlXl4kqIhfXQ1eVsUpZwJn7UFqKSpnbLfxzds,4498
93
93
  ingestr/src/mixpanel/__init__.py,sha256=s1QtqMP0BTGW6YtdCabJFWj7lEn7KujzELwGpBOQgfs,1796
94
94
  ingestr/src/mixpanel/client.py,sha256=c_reouegOVYBOwHLfgYFwpmkba0Sxro1Zkml07NCYf0,3602
95
- ingestr/src/mongodb/__init__.py,sha256=T-RYPS_skl_2gNVfYWWXan2bVQYmm0bFBcCCqG5ejvg,7275
96
- ingestr/src/mongodb/helpers.py,sha256=8pjNYZu4k2rkR9dItTMAnPaRdF1kroqLYX9FZ34RTqo,24491
95
+ ingestr/src/mongodb/__init__.py,sha256=5KNdR2mxJoHSOU1pt-FIJNg9HT4aHPwl6mI31xPBQLA,7487
96
+ ingestr/src/mongodb/helpers.py,sha256=VMGKkSN6FIQ4l-4TUqoc-Ou7r52_zPXuLF33ZN23B_I,30881
97
97
  ingestr/src/notion/__init__.py,sha256=36wUui8finbc85ObkRMq8boMraXMUehdABN_AMe_hzA,1834
98
98
  ingestr/src/notion/settings.py,sha256=MwQVZViJtnvOegfjXYc_pJ50oUYgSRPgwqu7TvpeMOA,82
99
99
  ingestr/src/notion/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,8 +151,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
151
151
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
152
152
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
153
153
  ingestr/tests/unit/test_smartsheets.py,sha256=eiC2CCO4iNJcuN36ONvqmEDryCA1bA1REpayHpu42lk,5058
154
- ingestr-0.13.78.dist-info/METADATA,sha256=Q7ofO2TRuTOUb4fhZvyr_kejvaOM2OwCrq3FnCLEk6U,15093
155
- ingestr-0.13.78.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
156
- ingestr-0.13.78.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
157
- ingestr-0.13.78.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
158
- ingestr-0.13.78.dist-info/RECORD,,
154
+ ingestr-0.13.79.dist-info/METADATA,sha256=5dl0NFB3Ach1_lFtE4xOJpud_chn_w0qvepZnnMjRzo,15182
155
+ ingestr-0.13.79.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
156
+ ingestr-0.13.79.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
157
+ ingestr-0.13.79.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
158
+ ingestr-0.13.79.dist-info/RECORD,,