ingestr 0.13.78__py3-none-any.whl → 0.13.80__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of ingestr has been flagged as potentially problematic.
- ingestr/main.py +10 -3
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/destinations.py +18 -0
- ingestr/src/facebook_ads/__init__.py +6 -2
- ingestr/src/facebook_ads/helpers.py +1 -1
- ingestr/src/factory.py +5 -0
- ingestr/src/freshdesk/__init__.py +23 -8
- ingestr/src/freshdesk/freshdesk_client.py +16 -5
- ingestr/src/github/__init__.py +5 -3
- ingestr/src/github/helpers.py +1 -0
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/linear/__init__.py +61 -43
- ingestr/src/linear/helpers.py +19 -36
- ingestr/src/mongodb/__init__.py +3 -0
- ingestr/src/mongodb/helpers.py +178 -11
- ingestr/src/sources.py +311 -25
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/METADATA +6 -1
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/RECORD +21 -21
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/WHEEL +0 -0
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/mongodb/helpers.py
CHANGED
@@ -1,6 +1,5 @@
 """Mongo database source helpers"""
 
-import json
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -209,7 +208,7 @@ class CollectionLoader:
             if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
                 yield dlt.mark.with_hints(
                     res,
-                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
                 )
             else:
                 yield res
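The change above completes a `make_hints` call that the previous release left unclosed. For background, a minimal sketch of the dlt hint-marking pattern this hunk relies on; the sample batch is hypothetical:

import dlt

# Hypothetical batch whose _id is a composite (dict) key.
batch = [{"_id": {"tenant": "acme", "seq": 1}, "value": 42}]

# make_hints builds a column-level schema override; with_hints attaches it to
# the yielded item so dlt stores _id as a json column instead of relying on
# per-batch type inference.
item = dlt.mark.with_hints(
    batch,
    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
)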
@@ -472,6 +471,145 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
                 yield convert_arrow_columns(table)
 
 
+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        print("pipeline", pipeline)
+        # Execute aggregation
+        cursor = self.collection.aggregate(pipeline, allowDiskUse=True)
+
+        # Process results in chunks
+        while docs_slice := list(islice(cursor, self.chunk_size)):
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            print("res", res)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
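To make the assembly logic in CollectionAggregationLoader.load_documents concrete, here is a small sketch that mirrors the steps above with a hypothetical custom query; it shows the order the stages end up in before aggregate() runs with allowDiskUse=True:

# Hypothetical custom query supplied by a caller.
custom_query = [
    {"$match": {"status": "active"}},
    {"$project": {"status": 1, "updated_at": 1}},
]
filter_ = {"updated_at": {"$gte": "2024-01-01"}}
limit = 1000

# Mirrors the assembly steps in CollectionAggregationLoader.load_documents.
pipeline = list(custom_query)                # copy, never mutate the caller's list
if filter_:
    pipeline.insert(0, {"$match": filter_})  # extra filter runs first
if limit and limit > 0:
    pipeline.append({"$limit": limit})       # cap the result set last

# pipeline is now:
# [{'$match': {'updated_at': {'$gte': '2024-01-01'}}},
#  {'$match': {'status': 'active'}},
#  {'$project': {'status': 1, 'updated_at': 1}},
#  {'$limit': 1000}]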
@@ -483,6 +621,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -507,6 +646,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()
 
     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
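The new custom_query argument is documented as a raw aggregation pipeline. Based on the loader comments about interval placeholders, an incremental pipeline might look like the following sketch; field names are hypothetical, and placeholder substitution is assumed to happen before the pipeline reaches the loader:

# Hypothetical incremental pipeline. The :interval_start / :interval_end
# placeholders are assumed to be replaced with real bounds upstream, per the
# comments in CollectionAggregationLoader.load_documents.
custom_query = [
    {"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},
    {"$group": {"_id": "$customer_id", "order_count": {"$sum": 1}}},
]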
@@ -529,21 +669,48 @@
             "create a projection to select fields, `projection` will be ignored."
         )
 
-    if parallel:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoaderParallel
-        else:
-            LoaderClass = CollectionLoaderParallel  # type: ignore
-    else:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoader  # type: ignore
-        else:
-            LoaderClass = CollectionLoader  # type: ignore
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
+        else:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
+    else:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
+        else:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore
 
     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-    if isinstance(loader, (CollectionArrowLoader, CollectionArrowLoaderParallel)):
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,