ingestr 0.13.94__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic; see the advisory details on the package registry's page for this release.

@@ -1,157 +1,157 @@
1
- from typing import Any, Iterator, Optional
2
-
3
- import dlt
4
- from dlt.common.pendulum import pendulum
5
- from dlt.common.time import ensure_pendulum_datetime
6
- from dlt.common.typing import TAnyDateTime
7
-
8
- from ingestr.src.frankfurter.helpers import get_path_with_retry
9
-
10
-
11
- @dlt.source(
12
- name="frankfurter",
13
- max_table_nesting=0,
14
- )
15
- def frankfurter_source(
16
- start_date: TAnyDateTime,
17
- end_date: TAnyDateTime | None,
18
- base_currency: str,
19
- ) -> Any:
20
- """
21
- A dlt source for the frankfurter.dev API. It groups several resources (in this case frankfurter.dev API endpoints) containing
22
- various types of data: currencies, latest rates, historical rates.
23
- """
24
-
25
- @dlt.resource(
26
- write_disposition="replace",
27
- )
28
- def currencies() -> Iterator[dict]:
29
- """
30
- Yields each currency as a separate row with two columns: currency_code and currency_name.
31
- """
32
- # Retrieve the list of currencies from the API
33
- currencies_data = get_path_with_retry("currencies")
34
-
35
- for currency_code, currency_name in currencies_data.items():
36
- yield {"currency_code": currency_code, "currency_name": currency_name}
37
-
38
- @dlt.resource(
39
- write_disposition="merge",
40
- columns={
41
- "date": {"data_type": "text"},
42
- "currency_code": {"data_type": "text"},
43
- "rate": {"data_type": "double"},
44
- "base_currency": {"data_type": "text"},
45
- },
46
- primary_key=["date", "currency_code", "base_currency"],
47
- )
48
- def latest(base_currency: Optional[str] = "") -> Iterator[dict]:
49
- """
50
- Fetches the latest exchange rates and yields them as rows.
51
- """
52
- # Base URL
53
- url = "latest?"
54
-
55
- if base_currency:
56
- url += f"base={base_currency}"
57
-
58
- # Fetch data
59
- data = get_path_with_retry(url)
60
-
61
- # Extract rates and base currency
62
- rates = data["rates"]
63
- date = pendulum.parse(data["date"])
64
-
65
- # Add the base currency with a rate of 1.0
66
- yield {
67
- "date": date,
68
- "currency_code": base_currency,
69
- "rate": 1.0,
70
- "base_currency": base_currency,
71
- }
72
-
73
- # Add all currencies and their rates
74
- for currency_code, rate in rates.items():
75
- yield {
76
- "date": date,
77
- "currency_code": currency_code,
78
- "rate": rate,
79
- "base_currency": base_currency,
80
- }
81
-
82
- @dlt.resource(
83
- write_disposition="merge",
84
- columns={
85
- "date": {"data_type": "text"},
86
- "currency_code": {"data_type": "text"},
87
- "rate": {"data_type": "double"},
88
- "base_currency": {"data_type": "text"},
89
- },
90
- primary_key=("date", "currency_code", "base_currency"),
91
- )
92
- def exchange_rates(
93
- date_time=dlt.sources.incremental(
94
- "date",
95
- initial_value=start_date,
96
- end_value=end_date,
97
- range_start="closed",
98
- range_end="closed",
99
- ),
100
- ) -> Iterator[dict]:
101
- """
102
- Fetches exchange rates for a specified date range.
103
- If only start_date is provided, fetches data until now.
104
- If both start_date and end_date are provided, fetches data for each day in the range.
105
- """
106
- if date_time.last_value is not None:
107
- start_date = date_time.last_value
108
- else:
109
- start_date = start_date
110
-
111
- if date_time.end_value is not None:
112
- end_date = date_time.end_value
113
- else:
114
- end_date = pendulum.now()
115
-
116
- # Ensure start_date.last_value is a pendulum.DateTime object
117
- start_date_obj = ensure_pendulum_datetime(start_date) # type: ignore
118
- start_date_str = start_date_obj.format("YYYY-MM-DD")
119
-
120
- # Ensure end_date is a pendulum.DateTime object
121
- end_date_obj = ensure_pendulum_datetime(end_date)
122
- end_date_str = end_date_obj.format("YYYY-MM-DD")
123
-
124
- # Compose the URL
125
- url = f"{start_date_str}..{end_date_str}?"
126
-
127
- if base_currency:
128
- url += f"base={base_currency}"
129
-
130
- # Fetch data from the API
131
- data = get_path_with_retry(url)
132
-
133
- # Extract base currency and rates from the API response
134
- rates = data["rates"]
135
-
136
- # Iterate over the rates dictionary (one entry per date)
137
- for date, daily_rates in rates.items():
138
- formatted_date = pendulum.parse(date)
139
-
140
- # Add the base currency with a rate of 1.0
141
- yield {
142
- "date": formatted_date,
143
- "currency_code": base_currency,
144
- "rate": 1.0,
145
- "base_currency": base_currency,
146
- }
147
-
148
- # Add all other currencies and their rates
149
- for currency_code, rate in daily_rates.items():
150
- yield {
151
- "date": formatted_date,
152
- "currency_code": currency_code,
153
- "rate": rate,
154
- "base_currency": base_currency,
155
- }
156
-
157
- return currencies, latest, exchange_rates
1
+ from typing import Any, Iterator, Optional
2
+
3
+ import dlt
4
+ from dlt.common.pendulum import pendulum
5
+ from dlt.common.time import ensure_pendulum_datetime
6
+ from dlt.common.typing import TAnyDateTime
7
+
8
+ from ingestr.src.frankfurter.helpers import get_path_with_retry
9
+
10
+
11
+ @dlt.source(
12
+ name="frankfurter",
13
+ max_table_nesting=0,
14
+ )
15
+ def frankfurter_source(
16
+ start_date: TAnyDateTime,
17
+ end_date: TAnyDateTime | None,
18
+ base_currency: str,
19
+ ) -> Any:
20
+ """
21
+ A dlt source for the frankfurter.dev API. It groups several resources (in this case frankfurter.dev API endpoints) containing
22
+ various types of data: currencies, latest rates, historical rates.
23
+ """
24
+
25
+ @dlt.resource(
26
+ write_disposition="replace",
27
+ )
28
+ def currencies() -> Iterator[dict]:
29
+ """
30
+ Yields each currency as a separate row with two columns: currency_code and currency_name.
31
+ """
32
+ # Retrieve the list of currencies from the API
33
+ currencies_data = get_path_with_retry("currencies")
34
+
35
+ for currency_code, currency_name in currencies_data.items():
36
+ yield {"currency_code": currency_code, "currency_name": currency_name}
37
+
38
+ @dlt.resource(
39
+ write_disposition="merge",
40
+ columns={
41
+ "date": {"data_type": "text"},
42
+ "currency_code": {"data_type": "text"},
43
+ "rate": {"data_type": "double"},
44
+ "base_currency": {"data_type": "text"},
45
+ },
46
+ primary_key=["date", "currency_code", "base_currency"],
47
+ )
48
+ def latest(base_currency: Optional[str] = "") -> Iterator[dict]:
49
+ """
50
+ Fetches the latest exchange rates and yields them as rows.
51
+ """
52
+ # Base URL
53
+ url = "latest?"
54
+
55
+ if base_currency:
56
+ url += f"base={base_currency}"
57
+
58
+ # Fetch data
59
+ data = get_path_with_retry(url)
60
+
61
+ # Extract rates and base currency
62
+ rates = data["rates"]
63
+ date = pendulum.parse(data["date"])
64
+
65
+ # Add the base currency with a rate of 1.0
66
+ yield {
67
+ "date": date,
68
+ "currency_code": base_currency,
69
+ "rate": 1.0,
70
+ "base_currency": base_currency,
71
+ }
72
+
73
+ # Add all currencies and their rates
74
+ for currency_code, rate in rates.items():
75
+ yield {
76
+ "date": date,
77
+ "currency_code": currency_code,
78
+ "rate": rate,
79
+ "base_currency": base_currency,
80
+ }
81
+
82
+ @dlt.resource(
83
+ write_disposition="merge",
84
+ columns={
85
+ "date": {"data_type": "text"},
86
+ "currency_code": {"data_type": "text"},
87
+ "rate": {"data_type": "double"},
88
+ "base_currency": {"data_type": "text"},
89
+ },
90
+ primary_key=("date", "currency_code", "base_currency"),
91
+ )
92
+ def exchange_rates(
93
+ date_time=dlt.sources.incremental(
94
+ "date",
95
+ initial_value=start_date,
96
+ end_value=end_date,
97
+ range_start="closed",
98
+ range_end="closed",
99
+ ),
100
+ ) -> Iterator[dict]:
101
+ """
102
+ Fetches exchange rates for a specified date range.
103
+ If only start_date is provided, fetches data until now.
104
+ If both start_date and end_date are provided, fetches data for each day in the range.
105
+ """
106
+ if date_time.last_value is not None:
107
+ start_date = date_time.last_value
108
+ else:
109
+ start_date = start_date
110
+
111
+ if date_time.end_value is not None:
112
+ end_date = date_time.end_value
113
+ else:
114
+ end_date = pendulum.now()
115
+
116
+ # Ensure start_date.last_value is a pendulum.DateTime object
117
+ start_date_obj = ensure_pendulum_datetime(start_date) # type: ignore
118
+ start_date_str = start_date_obj.format("YYYY-MM-DD")
119
+
120
+ # Ensure end_date is a pendulum.DateTime object
121
+ end_date_obj = ensure_pendulum_datetime(end_date)
122
+ end_date_str = end_date_obj.format("YYYY-MM-DD")
123
+
124
+ # Compose the URL
125
+ url = f"{start_date_str}..{end_date_str}?"
126
+
127
+ if base_currency:
128
+ url += f"base={base_currency}"
129
+
130
+ # Fetch data from the API
131
+ data = get_path_with_retry(url)
132
+
133
+ # Extract base currency and rates from the API response
134
+ rates = data["rates"]
135
+
136
+ # Iterate over the rates dictionary (one entry per date)
137
+ for date, daily_rates in rates.items():
138
+ formatted_date = pendulum.parse(date)
139
+
140
+ # Add the base currency with a rate of 1.0
141
+ yield {
142
+ "date": formatted_date,
143
+ "currency_code": base_currency,
144
+ "rate": 1.0,
145
+ "base_currency": base_currency,
146
+ }
147
+
148
+ # Add all other currencies and their rates
149
+ for currency_code, rate in daily_rates.items():
150
+ yield {
151
+ "date": formatted_date,
152
+ "currency_code": currency_code,
153
+ "rate": rate,
154
+ "base_currency": base_currency,
155
+ }
156
+
157
+ return currencies, latest, exchange_rates
@@ -0,0 +1,49 @@
1
+ """Fundraiseup source for ingesting donations, events, fundraisers, recurring plans, and supporters."""
2
+
3
+ from typing import Any, Dict, Generator, Iterable
4
+
5
+ import dlt
6
+ from dlt.sources import DltResource
7
+
8
+ from .client import FundraiseupClient
9
+
10
+
11
+ @dlt.source(name="fundraiseup", max_table_nesting=0)
12
+ def fundraiseup_source(api_key: str) -> Iterable[DltResource]:
13
+ """
14
+ Return resources for Fundraiseup API.
15
+
16
+ Args:
17
+ api_key: API key for authentication
18
+
19
+ Returns:
20
+ Iterable of DLT resources
21
+ """
22
+ client = FundraiseupClient(api_key=api_key)
23
+
24
+ # Define available resources and their configurations
25
+ resources = {
26
+ "donations": {"write_disposition": "replace", "primary_key": "id"},
27
+ "events": {"write_disposition": "replace", "primary_key": "id"},
28
+ "fundraisers": {"write_disposition": "replace", "primary_key": "id"},
29
+ "recurring_plans": {"write_disposition": "replace", "primary_key": "id"},
30
+ "supporters": {"write_disposition": "replace", "primary_key": "id"},
31
+ }
32
+
33
+ def create_resource(resource_name: str, config: Dict[str, Any]) -> DltResource:
34
+ """Create a DLT resource dynamically."""
35
+
36
+ @dlt.resource(
37
+ name=resource_name,
38
+ write_disposition=config["write_disposition"],
39
+ primary_key=config["primary_key"],
40
+ )
41
+ def generic_resource() -> Generator[Dict[str, Any], None, None]:
42
+ """Generic resource that yields batches directly."""
43
+ for batch in client.get_paginated_data(resource_name):
44
+ yield batch # type: ignore[misc]
45
+
46
+ return generic_resource()
47
+
48
+ # Return all resources
49
+ return [create_resource(name, config) for name, config in resources.items()]
@@ -0,0 +1,81 @@
1
+ """Fundraiseup API Client for handling authentication and paginated requests."""
2
+
3
+ from typing import Any, Dict, Iterator, Optional
4
+
5
+ from ingestr.src.http_client import create_client
6
+
7
+
8
+ class FundraiseupClient:
9
+ """Client for interacting with Fundraiseup API v1."""
10
+
11
+ def __init__(self, api_key: str):
12
+ """
13
+ Initialize Fundraiseup API client.
14
+
15
+ Args:
16
+ api_key: API key for authentication
17
+ """
18
+ self.api_key = api_key
19
+ self.base_url = "https://api.fundraiseup.com/v1"
20
+ # Use shared HTTP client with retry logic for rate limiting
21
+ self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])
22
+
23
+ def get_paginated_data(
24
+ self,
25
+ endpoint: str,
26
+ params: Optional[Dict[str, Any]] = None,
27
+ page_size: int = 100,
28
+ ) -> Iterator[list[Dict[str, Any]]]:
29
+ """
30
+ Fetch paginated data from a Fundraiseup API endpoint using cursor-based pagination.
31
+
32
+ Args:
33
+ endpoint: API endpoint path (e.g., "donations")
34
+ params: Additional query parameters
35
+ page_size: Number of items per page (default 100)
36
+
37
+ Yields:
38
+ Batches of items from the API
39
+ """
40
+ url = f"{self.base_url}/{endpoint}"
41
+ headers = {
42
+ "Authorization": f"Bearer {self.api_key}",
43
+ "Content-Type": "application/json",
44
+ }
45
+
46
+ if params is None:
47
+ params = {}
48
+
49
+ params["limit"] = page_size
50
+ starting_after = None
51
+
52
+ while True:
53
+ # Add cursor for pagination if not first page
54
+ if starting_after:
55
+ params["starting_after"] = starting_after
56
+
57
+ response = self.client.get(url=url, headers=headers, params=params)
58
+ response.raise_for_status()
59
+
60
+ data = response.json()
61
+
62
+ # Handle both list response and object with data array
63
+ if isinstance(data, list):
64
+ items = data
65
+ has_more = len(items) == page_size
66
+ else:
67
+ items = data.get("data", [])
68
+ has_more = data.get("has_more", False)
69
+
70
+ if not items:
71
+ break
72
+
73
+ yield items
74
+
75
+ # Set cursor for next page
76
+ if has_more and items:
77
+ starting_after = items[-1].get("id")
78
+ if not starting_after:
79
+ break
80
+ else:
81
+ break
@@ -7,7 +7,7 @@ from typing import Iterator, List, Optional, Union
7
7
  import dlt
8
8
  from dlt.common import pendulum
9
9
  from dlt.common.typing import DictStrAny, TDataItem
10
- from dlt.extract import DltResource
10
+ from dlt.sources import DltResource
11
11
  from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
12
12
  from google.analytics.data_v1beta import BetaAnalyticsDataClient
13
13
  from google.analytics.data_v1beta.types import (
@@ -107,7 +107,7 @@ def mongodb_collection(
107
107
  projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
108
108
  pymongoarrow_schema: Optional[Any] = None,
109
109
  custom_query: Optional[List[Dict[str, Any]]] = None,
110
- ) -> Any:
110
+ ) -> DltResource:
111
111
  """
112
112
  A DLT source which loads a collection from a mongo database using PyMongo.
113
113
 
@@ -1,4 +1,4 @@
1
- """Mongo database source helpers"""
1
+ """Mongo database source helpers and destination utilities"""
2
2
 
3
3
  import re
4
4
  from itertools import islice
@@ -23,6 +23,7 @@ from bson.timestamp import Timestamp
23
23
  from dlt.common import logger
24
24
  from dlt.common.configuration.specs import BaseConfiguration, configspec
25
25
  from dlt.common.data_writers import TDataItemFormat
26
+ from dlt.common.schema import TTableSchema
26
27
  from dlt.common.time import ensure_pendulum_datetime
27
28
  from dlt.common.typing import TDataItem
28
29
  from dlt.common.utils import map_nested_in_place
@@ -945,3 +946,70 @@ def convert_mongo_shell_to_extended_json(query_string: str) -> str:
945
946
 
946
947
 
947
948
  __source_name__ = "mongodb"
949
+
950
+
951
+ # MongoDB destination helper functions
952
+ def process_file_items(file_path: str) -> list[dict]:
953
+ """Process items from a file path (JSONL format)."""
954
+ import json
955
+
956
+ documents = []
957
+ with open(file_path, "r") as f:
958
+ for line in f:
959
+ if line.strip():
960
+ doc = json.loads(line.strip())
961
+ documents.append(doc) # Include all fields including DLT metadata
962
+ return documents
963
+
964
+
965
+ def mongodb_insert(uri: str, database: str):
966
+ """Creates a dlt.destination for inserting data into a MongoDB collection.
967
+
968
+ Args:
969
+ uri (str): MongoDB connection URI.
970
+ database (str): Name of the MongoDB database.
971
+
972
+ Returns:
973
+ dlt.destination: A DLT destination object configured for MongoDB.
974
+ """
975
+
976
+ state = {"first_batch": True}
977
+
978
+ def destination(items: TDataItem, table: TTableSchema) -> None:
979
+ import pyarrow
980
+ from pymongo import MongoClient
981
+
982
+ # Extract database name from connection string
983
+ # Get collection name from table metadata
984
+ collection_name = table["name"]
985
+
986
+ # Connect to MongoDB
987
+ client: MongoClient
988
+
989
+ with MongoClient(uri) as client:
990
+ db = client[database]
991
+ collection = db[collection_name]
992
+
993
+ # Process and insert documents
994
+ if isinstance(items, str):
995
+ documents = process_file_items(items)
996
+ elif isinstance(items, pyarrow.RecordBatch):
997
+ documents = [item for item in items.to_pylist()]
998
+ else:
999
+ documents = [item for item in items if isinstance(item, dict)]
1000
+
1001
+ if state["first_batch"] and documents:
1002
+ collection.delete_many({})
1003
+ state["first_batch"] = False
1004
+
1005
+ if documents:
1006
+ collection.insert_many(documents) # Insert all new data
1007
+
1008
+ return dlt.destination(
1009
+ destination,
1010
+ name="mongodb",
1011
+ loader_file_format="typed-jsonl",
1012
+ batch_size=1000,
1013
+ naming_convention="snake_case",
1014
+ loader_parallelism_strategy="sequential",
1015
+ )
ingestr/src/sources.py CHANGED
@@ -237,6 +237,9 @@ class SqlSource:
237
237
  backend_kwargs: Dict[str, Any] = None, # type: ignore
238
238
  type_adapter_callback: Optional[TTypeAdapter] = None,
239
239
  included_columns: Optional[List[str]] = None,
240
+ excluded_columns: Optional[
241
+ List[str]
242
+ ] = None, # Added for dlt 1.16.0 compatibility
240
243
  query_adapter_callback: Optional[TQueryAdapter] = None,
241
244
  resolve_foreign_keys: bool = False,
242
245
  ) -> Iterator[TDataItem]:
@@ -3623,3 +3626,80 @@ class WiseSource:
3623
3626
  start_date=start_date,
3624
3627
  end_date=end_date,
3625
3628
  ).with_resources(table)
3629
+
3630
+
3631
+ class FundraiseupSource:
3632
+ def handles_incrementality(self) -> bool:
3633
+ return False
3634
+
3635
+ def dlt_source(self, uri: str, table: str, **kwargs):
3636
+ parsed_uri = urlparse(uri)
3637
+ params = parse_qs(parsed_uri.query)
3638
+
3639
+ api_key = params.get("api_key")
3640
+ if api_key is None:
3641
+ raise MissingValueError("api_key", "Fundraiseup")
3642
+
3643
+ if table not in [
3644
+ "donations",
3645
+ "events",
3646
+ "fundraisers",
3647
+ "recurring_plans",
3648
+ "supporters",
3649
+ ]:
3650
+ raise UnsupportedResourceError(table, "Fundraiseup")
3651
+
3652
+ from ingestr.src.fundraiseup import fundraiseup_source
3653
+
3654
+ return fundraiseup_source(
3655
+ api_key=api_key[0],
3656
+ ).with_resources(table)
3657
+
3658
+
3659
+ class AnthropicSource:
3660
+ def handles_incrementality(self) -> bool:
3661
+ return True
3662
+
3663
+ def dlt_source(self, uri: str, table: str, **kwargs):
3664
+ # anthropic://?api_key=<admin_api_key>
3665
+ parsed_uri = urlparse(uri)
3666
+ params = parse_qs(parsed_uri.query)
3667
+
3668
+ api_key = params.get("api_key")
3669
+ if api_key is None:
3670
+ raise MissingValueError("api_key", "Anthropic")
3671
+
3672
+ if table not in [
3673
+ "claude_code_usage",
3674
+ "usage_report",
3675
+ "cost_report",
3676
+ "organization",
3677
+ "workspaces",
3678
+ "api_keys",
3679
+ "invites",
3680
+ "users",
3681
+ "workspace_members",
3682
+ ]:
3683
+ raise UnsupportedResourceError(table, "Anthropic")
3684
+
3685
+ # Get start and end dates from kwargs
3686
+ start_date = kwargs.get("interval_start")
3687
+ if start_date:
3688
+ start_date = ensure_pendulum_datetime(start_date)
3689
+ else:
3690
+ # Default to 2023-01-01
3691
+ start_date = pendulum.datetime(2023, 1, 1)
3692
+
3693
+ end_date = kwargs.get("interval_end")
3694
+ if end_date:
3695
+ end_date = ensure_pendulum_datetime(end_date)
3696
+ else:
3697
+ end_date = None
3698
+
3699
+ from ingestr.src.anthropic import anthropic_source
3700
+
3701
+ return anthropic_source(
3702
+ api_key=api_key[0],
3703
+ initial_start_date=start_date,
3704
+ end_date=end_date,
3705
+ ).with_resources(table)
@@ -1,6 +1,6 @@
1
1
  import sys
2
2
  import unittest
3
- from unittest.mock import MagicMock, patch
3
+ from unittest.mock import patch
4
4
 
5
5
  import smartsheet # type: ignore
6
6
  from smartsheet.models import Cell, Column, Row, Sheet # type: ignore