ingestr 0.13.94__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/destinations.py +142 -0
- ingestr/src/docebo/__init__.py +28 -46
- ingestr/src/elasticsearch/helpers.py +112 -0
- ingestr/src/facebook_ads/__init__.py +6 -1
- ingestr/src/factory.py +12 -0
- ingestr/src/fluxx/__init__.py +4033 -11246
- ingestr/src/fluxx/helpers.py +0 -7
- ingestr/src/frankfurter/__init__.py +157 -157
- ingestr/src/fundraiseup/__init__.py +49 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/google_analytics/__init__.py +1 -1
- ingestr/src/mongodb/__init__.py +1 -1
- ingestr/src/mongodb/helpers.py +69 -1
- ingestr/src/sources.py +80 -0
- ingestr/tests/unit/test_smartsheets.py +1 -1
- {ingestr-0.13.94.dist-info → ingestr-0.14.1.dist-info}/METADATA +12 -7
- {ingestr-0.13.94.dist-info → ingestr-0.14.1.dist-info}/RECORD +24 -19
- {ingestr-0.13.94.dist-info → ingestr-0.14.1.dist-info}/WHEEL +0 -0
- {ingestr-0.13.94.dist-info → ingestr-0.14.1.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.94.dist-info → ingestr-0.14.1.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/fluxx/helpers.py
CHANGED
|
@@ -43,12 +43,6 @@ def fluxx_api_request(
|
|
|
43
43
|
"Authorization": f"Bearer {access_token}",
|
|
44
44
|
"Content-Type": "application/json",
|
|
45
45
|
}
|
|
46
|
-
# print(f"Making request to Fluxx API:")
|
|
47
|
-
# print(f" Method: {method}")
|
|
48
|
-
# print(f" URL: {url}")
|
|
49
|
-
# print(f" Headers: {headers}")
|
|
50
|
-
# print(f" Params: {params}")
|
|
51
|
-
# print(f" Data: {data}")
|
|
52
46
|
|
|
53
47
|
response = requests.request(
|
|
54
48
|
method=method,
|
|
@@ -88,7 +82,6 @@ def paginate_fluxx_resource(
|
|
|
88
82
|
params=params,
|
|
89
83
|
)
|
|
90
84
|
|
|
91
|
-
print("resssponse", response)
|
|
92
85
|
if not response:
|
|
93
86
|
break
|
|
94
87
|
|
|
@@ -1,157 +1,157 @@
|
|
|
1
|
-
from typing import Any, Iterator, Optional
|
|
2
|
-
|
|
3
|
-
import dlt
|
|
4
|
-
from dlt.common.pendulum import pendulum
|
|
5
|
-
from dlt.common.time import ensure_pendulum_datetime
|
|
6
|
-
from dlt.common.typing import TAnyDateTime
|
|
7
|
-
|
|
8
|
-
from ingestr.src.frankfurter.helpers import get_path_with_retry
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@dlt.source(
|
|
12
|
-
name="frankfurter",
|
|
13
|
-
max_table_nesting=0,
|
|
14
|
-
)
|
|
15
|
-
def frankfurter_source(
|
|
16
|
-
start_date: TAnyDateTime,
|
|
17
|
-
end_date: TAnyDateTime | None,
|
|
18
|
-
base_currency: str,
|
|
19
|
-
) -> Any:
|
|
20
|
-
"""
|
|
21
|
-
A dlt source for the frankfurter.dev API. It groups several resources (in this case frankfurter.dev API endpoints) containing
|
|
22
|
-
various types of data: currencies, latest rates, historical rates.
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
@dlt.resource(
|
|
26
|
-
write_disposition="replace",
|
|
27
|
-
)
|
|
28
|
-
def currencies() -> Iterator[dict]:
|
|
29
|
-
"""
|
|
30
|
-
Yields each currency as a separate row with two columns: currency_code and currency_name.
|
|
31
|
-
"""
|
|
32
|
-
# Retrieve the list of currencies from the API
|
|
33
|
-
currencies_data = get_path_with_retry("currencies")
|
|
34
|
-
|
|
35
|
-
for currency_code, currency_name in currencies_data.items():
|
|
36
|
-
yield {"currency_code": currency_code, "currency_name": currency_name}
|
|
37
|
-
|
|
38
|
-
@dlt.resource(
|
|
39
|
-
write_disposition="merge",
|
|
40
|
-
columns={
|
|
41
|
-
"date": {"data_type": "text"},
|
|
42
|
-
"currency_code": {"data_type": "text"},
|
|
43
|
-
"rate": {"data_type": "double"},
|
|
44
|
-
"base_currency": {"data_type": "text"},
|
|
45
|
-
},
|
|
46
|
-
primary_key=["date", "currency_code", "base_currency"],
|
|
47
|
-
)
|
|
48
|
-
def latest(base_currency: Optional[str] = "") -> Iterator[dict]:
|
|
49
|
-
"""
|
|
50
|
-
Fetches the latest exchange rates and yields them as rows.
|
|
51
|
-
"""
|
|
52
|
-
# Base URL
|
|
53
|
-
url = "latest?"
|
|
54
|
-
|
|
55
|
-
if base_currency:
|
|
56
|
-
url += f"base={base_currency}"
|
|
57
|
-
|
|
58
|
-
# Fetch data
|
|
59
|
-
data = get_path_with_retry(url)
|
|
60
|
-
|
|
61
|
-
# Extract rates and base currency
|
|
62
|
-
rates = data["rates"]
|
|
63
|
-
date = pendulum.parse(data["date"])
|
|
64
|
-
|
|
65
|
-
# Add the base currency with a rate of 1.0
|
|
66
|
-
yield {
|
|
67
|
-
"date": date,
|
|
68
|
-
"currency_code": base_currency,
|
|
69
|
-
"rate": 1.0,
|
|
70
|
-
"base_currency": base_currency,
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
# Add all currencies and their rates
|
|
74
|
-
for currency_code, rate in rates.items():
|
|
75
|
-
yield {
|
|
76
|
-
"date": date,
|
|
77
|
-
"currency_code": currency_code,
|
|
78
|
-
"rate": rate,
|
|
79
|
-
"base_currency": base_currency,
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
@dlt.resource(
|
|
83
|
-
write_disposition="merge",
|
|
84
|
-
columns={
|
|
85
|
-
"date": {"data_type": "text"},
|
|
86
|
-
"currency_code": {"data_type": "text"},
|
|
87
|
-
"rate": {"data_type": "double"},
|
|
88
|
-
"base_currency": {"data_type": "text"},
|
|
89
|
-
},
|
|
90
|
-
primary_key=("date", "currency_code", "base_currency"),
|
|
91
|
-
)
|
|
92
|
-
def exchange_rates(
|
|
93
|
-
date_time=dlt.sources.incremental(
|
|
94
|
-
"date",
|
|
95
|
-
initial_value=start_date,
|
|
96
|
-
end_value=end_date,
|
|
97
|
-
range_start="closed",
|
|
98
|
-
range_end="closed",
|
|
99
|
-
),
|
|
100
|
-
) -> Iterator[dict]:
|
|
101
|
-
"""
|
|
102
|
-
Fetches exchange rates for a specified date range.
|
|
103
|
-
If only start_date is provided, fetches data until now.
|
|
104
|
-
If both start_date and end_date are provided, fetches data for each day in the range.
|
|
105
|
-
"""
|
|
106
|
-
if date_time.last_value is not None:
|
|
107
|
-
start_date = date_time.last_value
|
|
108
|
-
else:
|
|
109
|
-
start_date = start_date
|
|
110
|
-
|
|
111
|
-
if date_time.end_value is not None:
|
|
112
|
-
end_date = date_time.end_value
|
|
113
|
-
else:
|
|
114
|
-
end_date = pendulum.now()
|
|
115
|
-
|
|
116
|
-
# Ensure start_date.last_value is a pendulum.DateTime object
|
|
117
|
-
start_date_obj = ensure_pendulum_datetime(start_date) # type: ignore
|
|
118
|
-
start_date_str = start_date_obj.format("YYYY-MM-DD")
|
|
119
|
-
|
|
120
|
-
# Ensure end_date is a pendulum.DateTime object
|
|
121
|
-
end_date_obj = ensure_pendulum_datetime(end_date)
|
|
122
|
-
end_date_str = end_date_obj.format("YYYY-MM-DD")
|
|
123
|
-
|
|
124
|
-
# Compose the URL
|
|
125
|
-
url = f"{start_date_str}..{end_date_str}?"
|
|
126
|
-
|
|
127
|
-
if base_currency:
|
|
128
|
-
url += f"base={base_currency}"
|
|
129
|
-
|
|
130
|
-
# Fetch data from the API
|
|
131
|
-
data = get_path_with_retry(url)
|
|
132
|
-
|
|
133
|
-
# Extract base currency and rates from the API response
|
|
134
|
-
rates = data["rates"]
|
|
135
|
-
|
|
136
|
-
# Iterate over the rates dictionary (one entry per date)
|
|
137
|
-
for date, daily_rates in rates.items():
|
|
138
|
-
formatted_date = pendulum.parse(date)
|
|
139
|
-
|
|
140
|
-
# Add the base currency with a rate of 1.0
|
|
141
|
-
yield {
|
|
142
|
-
"date": formatted_date,
|
|
143
|
-
"currency_code": base_currency,
|
|
144
|
-
"rate": 1.0,
|
|
145
|
-
"base_currency": base_currency,
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
# Add all other currencies and their rates
|
|
149
|
-
for currency_code, rate in daily_rates.items():
|
|
150
|
-
yield {
|
|
151
|
-
"date": formatted_date,
|
|
152
|
-
"currency_code": currency_code,
|
|
153
|
-
"rate": rate,
|
|
154
|
-
"base_currency": base_currency,
|
|
155
|
-
}
|
|
156
|
-
|
|
157
|
-
return currencies, latest, exchange_rates
|
|
1
|
+
from typing import Any, Iterator, Optional
|
|
2
|
+
|
|
3
|
+
import dlt
|
|
4
|
+
from dlt.common.pendulum import pendulum
|
|
5
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
6
|
+
from dlt.common.typing import TAnyDateTime
|
|
7
|
+
|
|
8
|
+
from ingestr.src.frankfurter.helpers import get_path_with_retry
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dlt.source(
    name="frankfurter",
    max_table_nesting=0,
)
def frankfurter_source(
    start_date: TAnyDateTime,
    end_date: TAnyDateTime | None,
    base_currency: str,
) -> Any:
    """
    A dlt source for the frankfurter.dev API.

    It groups three resources, each backed by one API path fetched through
    ``get_path_with_retry``: ``currencies``, ``latest`` and ``exchange_rates``.

    Args:
        start_date: Initial value of the incremental ``date`` cursor used by
            ``exchange_rates``.
        end_date: End value of that cursor; when the cursor has no end value
            the current time is used instead.
        base_currency: Currency the historical rates are expressed in. It is
            added to the request as ``base=...`` when non-empty.

    Returns:
        The tuple ``(currencies, latest, exchange_rates)`` of resources.
    """

    @dlt.resource(
        write_disposition="replace",
    )
    def currencies() -> Iterator[dict]:
        """
        Yields each currency as a separate row with two columns: currency_code and currency_name.
        """
        # The response is iterated as a mapping of code -> name.
        currencies_data = get_path_with_retry("currencies")

        for currency_code, currency_name in currencies_data.items():
            yield {"currency_code": currency_code, "currency_name": currency_name}

    @dlt.resource(
        write_disposition="merge",
        columns={
            "date": {"data_type": "text"},
            "currency_code": {"data_type": "text"},
            "rate": {"data_type": "double"},
            "base_currency": {"data_type": "text"},
        },
        primary_key=["date", "currency_code", "base_currency"],
    )
    # NOTE(review): this parameter shadows the source-level `base_currency`, so
    # unless a value is supplied to the resource itself the default "" is used
    # and the rows below carry an empty base currency - confirm this is intended.
    def latest(base_currency: Optional[str] = "") -> Iterator[dict]:
        """
        Fetches the latest exchange rates and yields them as rows.
        """
        # Base URL
        url = "latest?"

        if base_currency:
            url += f"base={base_currency}"

        # Fetch data
        data = get_path_with_retry(url)

        # Extract rates and the date they apply to
        rates = data["rates"]
        date = pendulum.parse(data["date"])

        # Add the base currency with a rate of 1.0
        yield {
            "date": date,
            "currency_code": base_currency,
            "rate": 1.0,
            "base_currency": base_currency,
        }

        # Add all currencies and their rates
        for currency_code, rate in rates.items():
            yield {
                "date": date,
                "currency_code": currency_code,
                "rate": rate,
                "base_currency": base_currency,
            }

    @dlt.resource(
        write_disposition="merge",
        columns={
            "date": {"data_type": "text"},
            "currency_code": {"data_type": "text"},
            "rate": {"data_type": "double"},
            "base_currency": {"data_type": "text"},
        },
        primary_key=("date", "currency_code", "base_currency"),
    )
    def exchange_rates(
        date_time=dlt.sources.incremental(
            "date",
            initial_value=start_date,
            end_value=end_date,
            range_start="closed",
            range_end="closed",
        ),
    ) -> Iterator[dict]:
        """
        Fetches exchange rates for a specified date range.
        If only start_date is provided, fetches data until now.
        If both start_date and end_date are provided, fetches data for each day in the range.
        """
        # Resume from the incremental cursor when it has a value, otherwise fall
        # back to the source-level start_date. Distinct local names matter here:
        # assigning to `start_date` inside this function would make it a local
        # variable, and the fallback read would raise UnboundLocalError.
        if date_time.last_value is not None:
            range_start_value = date_time.last_value
        else:
            range_start_value = start_date

        if date_time.end_value is not None:
            range_end_value = date_time.end_value
        else:
            range_end_value = pendulum.now()

        # Normalise both bounds to pendulum datetimes and render them as dates
        start_date_obj = ensure_pendulum_datetime(range_start_value)  # type: ignore
        start_date_str = start_date_obj.format("YYYY-MM-DD")

        end_date_obj = ensure_pendulum_datetime(range_end_value)
        end_date_str = end_date_obj.format("YYYY-MM-DD")

        # Compose the URL
        url = f"{start_date_str}..{end_date_str}?"

        if base_currency:
            url += f"base={base_currency}"

        # Fetch data from the API
        data = get_path_with_retry(url)

        # Mapping of date string -> {currency_code: rate}
        rates = data["rates"]

        # Iterate over the rates dictionary (one entry per date)
        for date, daily_rates in rates.items():
            formatted_date = pendulum.parse(date)

            # Add the base currency with a rate of 1.0
            yield {
                "date": formatted_date,
                "currency_code": base_currency,
                "rate": 1.0,
                "base_currency": base_currency,
            }

            # Add all other currencies and their rates
            for currency_code, rate in daily_rates.items():
                yield {
                    "date": formatted_date,
                    "currency_code": currency_code,
                    "rate": rate,
                    "base_currency": base_currency,
                }

    return currencies, latest, exchange_rates
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Fundraiseup source for ingesting donations, events, fundraisers, recurring plans, and supporters."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Generator, Iterable
|
|
4
|
+
|
|
5
|
+
import dlt
|
|
6
|
+
from dlt.sources import DltResource
|
|
7
|
+
|
|
8
|
+
from .client import FundraiseupClient
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dlt.source(name="fundraiseup", max_table_nesting=0)
def fundraiseup_source(api_key: str) -> Iterable[DltResource]:
    """
    Build one dlt resource per supported Fundraiseup endpoint.

    Args:
        api_key: API key handed to ``FundraiseupClient``.

    Returns:
        A list with one resource each for donations, events, fundraisers,
        recurring_plans and supporters.
    """
    client = FundraiseupClient(api_key=api_key)

    # Every endpoint currently shares the same load settings.
    endpoint_names = (
        "donations",
        "events",
        "fundraisers",
        "recurring_plans",
        "supporters",
    )
    resources = {
        endpoint: {"write_disposition": "replace", "primary_key": "id"}
        for endpoint in endpoint_names
    }

    def _make_resource(resource_name: str, config: Dict[str, Any]) -> DltResource:
        """Wrap a single endpoint in a dlt resource named after it."""

        @dlt.resource(
            name=resource_name,
            write_disposition=config["write_disposition"],
            primary_key=config["primary_key"],
        )
        def generic_resource() -> Generator[Dict[str, Any], None, None]:
            """Relay each page returned by the client unchanged."""
            yield from client.get_paginated_data(resource_name)  # type: ignore[misc]

        return generic_resource()

    built = []
    for name, config in resources.items():
        built.append(_make_resource(name, config))
    return built
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Fundraiseup API Client for handling authentication and paginated requests."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Iterator, Optional
|
|
4
|
+
|
|
5
|
+
from ingestr.src.http_client import create_client
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FundraiseupClient:
    """Client for interacting with Fundraiseup API v1."""

    def __init__(self, api_key: str):
        """
        Initialize Fundraiseup API client.

        Args:
            api_key: API key for authentication
        """
        self.api_key = api_key
        self.base_url = "https://api.fundraiseup.com/v1"
        # Shared HTTP client; presumably retries on the listed status codes
        # (rate limiting and transient server errors) - see create_client.
        self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])

    def get_paginated_data(
        self,
        endpoint: str,
        params: Optional[Dict[str, Any]] = None,
        page_size: int = 100,
    ) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch paginated data from a Fundraiseup API endpoint using cursor-based pagination.

        Args:
            endpoint: API endpoint path (e.g., "donations")
            params: Additional query parameters. The mapping is copied and is
                never modified by this method.
            page_size: Number of items per page (default 100); sent as ``limit``
                and overrides any ``limit`` present in ``params``.

        Yields:
            Batches of items from the API, one non-empty list per page.

        Raises:
            Whatever ``response.raise_for_status()`` raises for a failed request.
        """
        url = f"{self.base_url}/{endpoint}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Work on a private copy so the pagination keys added below do not leak
        # into (or linger in) a dict the caller may reuse for other requests.
        query: Dict[str, Any] = dict(params) if params else {}
        query["limit"] = page_size

        while True:
            response = self.client.get(url=url, headers=headers, params=query)
            response.raise_for_status()

            data = response.json()

            # Handle both list response and object with data array
            if isinstance(data, list):
                items = data
                # A bare list carries no has_more flag; a full page is taken
                # as the sign that another page may follow.
                has_more = len(items) == page_size
            else:
                items = data.get("data", [])
                has_more = data.get("has_more", False)

            if not items:
                break

            yield items

            if not has_more:
                break

            # The id of the last item is the cursor for the next page; without
            # one there is no way to advance, so stop instead of looping.
            starting_after = items[-1].get("id")
            if not starting_after:
                break
            query["starting_after"] = starting_after
|
|
@@ -7,7 +7,7 @@ from typing import Iterator, List, Optional, Union
|
|
|
7
7
|
import dlt
|
|
8
8
|
from dlt.common import pendulum
|
|
9
9
|
from dlt.common.typing import DictStrAny, TDataItem
|
|
10
|
-
from dlt.
|
|
10
|
+
from dlt.sources import DltResource
|
|
11
11
|
from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
|
|
12
12
|
from google.analytics.data_v1beta import BetaAnalyticsDataClient
|
|
13
13
|
from google.analytics.data_v1beta.types import (
|
ingestr/src/mongodb/__init__.py
CHANGED
|
@@ -107,7 +107,7 @@ def mongodb_collection(
|
|
|
107
107
|
projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
|
|
108
108
|
pymongoarrow_schema: Optional[Any] = None,
|
|
109
109
|
custom_query: Optional[List[Dict[str, Any]]] = None,
|
|
110
|
-
) ->
|
|
110
|
+
) -> DltResource:
|
|
111
111
|
"""
|
|
112
112
|
A DLT source which loads a collection from a mongo database using PyMongo.
|
|
113
113
|
|
ingestr/src/mongodb/helpers.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Mongo database source helpers"""
|
|
1
|
+
"""Mongo database source helpers and destination utilities"""
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
4
|
from itertools import islice
|
|
@@ -23,6 +23,7 @@ from bson.timestamp import Timestamp
|
|
|
23
23
|
from dlt.common import logger
|
|
24
24
|
from dlt.common.configuration.specs import BaseConfiguration, configspec
|
|
25
25
|
from dlt.common.data_writers import TDataItemFormat
|
|
26
|
+
from dlt.common.schema import TTableSchema
|
|
26
27
|
from dlt.common.time import ensure_pendulum_datetime
|
|
27
28
|
from dlt.common.typing import TDataItem
|
|
28
29
|
from dlt.common.utils import map_nested_in_place
|
|
@@ -945,3 +946,70 @@ def convert_mongo_shell_to_extended_json(query_string: str) -> str:
|
|
|
945
946
|
|
|
946
947
|
|
|
947
948
|
__source_name__ = "mongodb"
|
|
949
|
+
|
|
950
|
+
|
|
951
|
+
# MongoDB destination helper functions
|
|
952
|
+
def process_file_items(file_path: str) -> list[dict]:
    """Read a JSONL file and return one parsed object per non-blank line.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        The parsed documents in file order. Every field is kept as-is,
        including any DLT metadata present in the lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import json

    documents = []
    # JSON text is UTF-8; decode it explicitly rather than relying on the
    # platform's default encoding, which differs between systems.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            payload = line.strip()
            if payload:  # skip blank / whitespace-only lines
                documents.append(json.loads(payload))
    return documents
|
|
963
|
+
|
|
964
|
+
|
|
965
|
+
def mongodb_insert(uri: str, database: str):
    """Creates a dlt.destination for inserting data into a MongoDB collection.

    Each table is written to the collection of the same name. The first
    non-empty batch written to a collection empties that collection first;
    later batches for the same collection are appended.

    Args:
        uri (str): MongoDB connection URI.
        database (str): Name of the MongoDB database.

    Returns:
        dlt.destination: A DLT destination object configured for MongoDB.
    """

    # Names of the collections already emptied by this destination. Tracking
    # this per collection (rather than with one shared flag) ensures every
    # table is cleared before its first insert, not only the first table seen.
    cleared_collections: set[str] = set()

    def destination(items: TDataItem, table: TTableSchema) -> None:
        """Insert one batch of `items` into the collection named by `table`."""
        import pyarrow
        from pymongo import MongoClient

        # Get collection name from table metadata
        collection_name = table["name"]

        # Normalise the batch into a list of dict documents
        if isinstance(items, str):
            # A string is treated as the path of a JSONL file
            documents = process_file_items(items)
        elif isinstance(items, pyarrow.RecordBatch):
            documents = items.to_pylist()
        else:
            documents = [item for item in items if isinstance(item, dict)]

        if not documents:
            # Nothing to write; leave the collection untouched
            return

        with MongoClient(uri) as client:
            collection = client[database][collection_name]

            if collection_name not in cleared_collections:
                collection.delete_many({})
                cleared_collections.add(collection_name)

            collection.insert_many(documents)  # Insert all new data

    return dlt.destination(
        destination,
        name="mongodb",
        loader_file_format="typed-jsonl",
        batch_size=1000,
        naming_convention="snake_case",
        loader_parallelism_strategy="sequential",
    )
|
ingestr/src/sources.py
CHANGED
|
@@ -237,6 +237,9 @@ class SqlSource:
|
|
|
237
237
|
backend_kwargs: Dict[str, Any] = None, # type: ignore
|
|
238
238
|
type_adapter_callback: Optional[TTypeAdapter] = None,
|
|
239
239
|
included_columns: Optional[List[str]] = None,
|
|
240
|
+
excluded_columns: Optional[
|
|
241
|
+
List[str]
|
|
242
|
+
] = None, # Added for dlt 1.16.0 compatibility
|
|
240
243
|
query_adapter_callback: Optional[TQueryAdapter] = None,
|
|
241
244
|
resolve_foreign_keys: bool = False,
|
|
242
245
|
) -> Iterator[TDataItem]:
|
|
@@ -3623,3 +3626,80 @@ class WiseSource:
|
|
|
3623
3626
|
start_date=start_date,
|
|
3624
3627
|
end_date=end_date,
|
|
3625
3628
|
).with_resources(table)
|
|
3629
|
+
|
|
3630
|
+
|
|
3631
|
+
class FundraiseupSource:
    """Builds the Fundraiseup dlt source from a URI carrying ``api_key``."""

    def handles_incrementality(self) -> bool:
        """Always returns False."""
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Return the Fundraiseup source restricted to ``table``.

        Args:
            uri: Source URI; ``api_key`` is read from its query string.
            table: One of donations, events, fundraisers, recurring_plans,
                supporters.
            **kwargs: Accepted but not used.

        Raises:
            MissingValueError: If the URI has no ``api_key`` query parameter.
            UnsupportedResourceError: If ``table`` is not a supported name.
        """
        query = parse_qs(urlparse(uri).query)

        api_key = query.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "Fundraiseup")

        supported_tables = [
            "donations",
            "events",
            "fundraisers",
            "recurring_plans",
            "supporters",
        ]
        if table not in supported_tables:
            raise UnsupportedResourceError(table, "Fundraiseup")

        from ingestr.src.fundraiseup import fundraiseup_source

        # parse_qs yields a list per key; the first value is used
        source = fundraiseup_source(api_key=api_key[0])
        return source.with_resources(table)
|
|
3657
|
+
|
|
3658
|
+
|
|
3659
|
+
class AnthropicSource:
    """Builds the Anthropic dlt source from a URI carrying ``api_key``."""

    def handles_incrementality(self) -> bool:
        """Always returns True."""
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Return the Anthropic source restricted to ``table``.

        Args:
            uri: Source URI of the form ``anthropic://?api_key=<admin_api_key>``.
            table: One of the supported table names listed in the body.
            **kwargs: ``interval_start`` and ``interval_end`` are read when
                present; a missing or falsy start falls back to 2023-01-01 and
                a missing or falsy end becomes None.

        Raises:
            MissingValueError: If the URI has no ``api_key`` query parameter.
            UnsupportedResourceError: If ``table`` is not a supported name.
        """
        # anthropic://?api_key=<admin_api_key>
        query = parse_qs(urlparse(uri).query)

        api_key = query.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "Anthropic")

        supported_tables = [
            "claude_code_usage",
            "usage_report",
            "cost_report",
            "organization",
            "workspaces",
            "api_keys",
            "invites",
            "users",
            "workspace_members",
        ]
        if table not in supported_tables:
            raise UnsupportedResourceError(table, "Anthropic")

        # Resolve the load window from kwargs
        interval_start = kwargs.get("interval_start")
        start_date = (
            ensure_pendulum_datetime(interval_start)
            if interval_start
            else pendulum.datetime(2023, 1, 1)  # Default to 2023-01-01
        )

        interval_end = kwargs.get("interval_end")
        end_date = ensure_pendulum_datetime(interval_end) if interval_end else None

        from ingestr.src.anthropic import anthropic_source

        # parse_qs yields a list per key; the first value is used
        source = anthropic_source(
            api_key=api_key[0],
            initial_start_date=start_date,
            end_date=end_date,
        )
        return source.with_resources(table)
|