ingestr 0.14.93__py3-none-any.whl → 0.14.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/factory.py +8 -0
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/revenuecat/__init__.py +16 -41
- ingestr/src/revenuecat/helpers.py +19 -73
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/sources.py +395 -0
- {ingestr-0.14.93.dist-info → ingestr-0.14.98.dist-info}/METADATA +2 -1
- {ingestr-0.14.93.dist-info → ingestr-0.14.98.dist-info}/RECORD +19 -10
- {ingestr-0.14.93.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
- {ingestr-0.14.93.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
- {ingestr-0.14.93.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/revenuecat/helpers.py
@@ -64,12 +64,9 @@ def _paginate(
     while True:
         data = _make_request(api_key, endpoint, current_params)

-        # Yield items from the current page
         if "items" in data and data["items"] is not None:
-            for item in data["items"]:
-                yield item
+            yield data["items"]

-        # Check if there's a next page
         if "next_page" not in data:
             break

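Note on the hunk above: _paginate now yields each page's "items" list whole instead of yielding records one at a time, so the dlt consumer receives a batch per iteration. A minimal sketch of the pattern, with a hypothetical fetch_page callable standing in for ingestr's _make_request:

def paginate(fetch_page):
    page = 0
    while True:
        items = fetch_page(page)
        if not items:
            break
        yield items  # one list per page, not one record per yield
        page += 1

# Each iteration receives a full page of records.
for batch in paginate(lambda p: [{"n": p}] if p < 2 else []):
    print(f"got a batch of {len(batch)} record(s)")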
@@ -88,7 +85,6 @@ def convert_timestamps_to_iso(
     """Convert timestamp fields from milliseconds to ISO format."""
     for field in timestamp_fields:
         if field in record and record[field] is not None:
-            # Convert from milliseconds timestamp to ISO datetime string
             timestamp_ms = record[field]
             dt = pendulum.from_timestamp(timestamp_ms / 1000)
             record[field] = dt.to_iso8601_string()
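For reference, the millisecond-to-ISO conversion that convert_timestamps_to_iso keeps doing can be reproduced in isolation; the sample record below is illustrative:

import pendulum

record = {"purchased_at": 1700000000000}  # epoch milliseconds (sample value)
dt = pendulum.from_timestamp(record["purchased_at"] / 1000)
record["purchased_at"] = dt.to_iso8601_string()
print(record["purchased_at"])  # e.g. "2023-11-14T22:13:20+00:00"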
@@ -177,87 +173,37 @@ async def _paginate_async(
     return items


-async def fetch_and_process_nested_resource_async(
-    session: aiohttp.ClientSession,
-    api_key: str,
-    project_id: str,
-    customer_id: str,
-    customer: Dict[str, Any],
-    resource_name: str,
-    timestamp_fields: Optional[List[str]] = None,
-) -> None:
-    """
-    Fetch and process any nested resource for a customer asynchronously.
-
-    Args:
-        session: aiohttp ClientSession
-        api_key: RevenueCat API key
-        project_id: Project ID
-        customer_id: Customer ID
-        customer: Customer data dictionary to modify
-        resource_name: Name of the nested resource (e.g., 'purchases', 'subscriptions', 'events')
-        timestamp_fields: List of timestamp fields to convert to ISO format
-    """
-    # If resource not included in customer data, fetch separately
-    if resource_name not in customer or customer[resource_name] is None:
-        endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
-        customer[resource_name] = await _paginate_async(session, api_key, endpoint)
-
-    # Convert timestamps if fields specified
-    if (
-        timestamp_fields
-        and resource_name in customer
-        and customer[resource_name] is not None
-    ):
-        for item in customer[resource_name]:
-            convert_timestamps_to_iso(item, timestamp_fields)
-
-
 async def process_customer_with_nested_resources_async(
     session: aiohttp.ClientSession,
     api_key: str,
     project_id: str,
     customer: Dict[str, Any],
 ) -> Dict[str, Any]:
-    """
-    Process a customer and fetch nested resources concurrently.
-
-    Args:
-        session: aiohttp ClientSession
-        api_key: RevenueCat API key
-        project_id: Project ID
-        customer: Customer data to process
-
-    Returns:
-        Customer data with nested resources populated
-    """
     customer_id = customer["id"]
-
-    # Convert customer timestamps
     customer = convert_timestamps_to_iso(customer, ["first_seen_at", "last_seen_at"])
-
-    # Define nested resources to fetch concurrently
     nested_resources = [
         ("subscriptions", ["purchased_at", "expires_at", "grace_period_expires_at"]),
         ("purchases", ["purchased_at", "expires_at"]),
     ]

-    # Fetch all nested resources concurrently
-    await asyncio.gather(
-        *[
-            fetch_and_process_nested_resource_async(
-                session,
-                api_key,
-                project_id,
-                customer_id,
-                customer,
-                resource_name,
-                timestamp_fields,
-            )
-            for resource_name, timestamp_fields in nested_resources
-        ]
-    )
+    async def fetch_and_convert(resource_name, timestamp_fields):
+        if resource_name not in customer or customer[resource_name] is None:
+            endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
+            customer[resource_name] = await _paginate_async(session, api_key, endpoint)
+        if (
+            timestamp_fields
+            and resource_name in customer
+            and customer[resource_name] is not None
+        ):
+            for item in customer[resource_name]:
+                convert_timestamps_to_iso(item, timestamp_fields)
+
+    await asyncio.gather(
+        *[
+            fetch_and_convert(resource_name, timestamp_fields)
+            for resource_name, timestamp_fields in nested_resources
+        ]
+    )

     return customer

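The refactor above folds the old fetch_and_process_nested_resource_async helper into a local coroutine and fans out one task per nested resource with asyncio.gather. A self-contained sketch of the same pattern, with hypothetical enrich/fetch names and a sleep standing in for the HTTP call:

import asyncio

async def enrich(customer: dict) -> dict:
    resources = ["subscriptions", "purchases"]

    async def fetch(name: str) -> None:
        await asyncio.sleep(0)  # stand-in for the real request
        customer.setdefault(name, [])  # the closure mutates shared state

    # One concurrent task per resource, mirroring the gather call in the diff.
    await asyncio.gather(*[fetch(name) for name in resources])
    return customer

print(asyncio.run(enrich({"id": "c1"})))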
ingestr/src/socrata_source/__init__.py
@@ -0,0 +1,83 @@
+"""A source loading data from the Socrata open data platform"""
+
+from typing import Any, Dict, Iterator, Optional
+
+import dlt
+
+from .helpers import fetch_data
+
+
+@dlt.source(name="socrata", max_table_nesting=0)
+def source(
+    domain: str,
+    dataset_id: str,
+    app_token: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    incremental: Optional[Any] = None,
+    primary_key: Optional[str] = None,
+    write_disposition: Optional[str] = dlt.config.value,
+):
+    """
+    A dlt source for the Socrata open data platform.
+
+    Supports both full refresh (replace) and incremental loading (merge).
+
+    Args:
+        domain: The Socrata domain (e.g., "evergreen.data.socrata.com")
+        dataset_id: The dataset identifier (e.g., "6udu-fhnu")
+        app_token: Socrata app token for higher rate limits (recommended)
+        username: Username for authentication (if the dataset is private)
+        password: Password for authentication (if the dataset is private)
+        incremental: DLT incremental object for incremental loading
+        primary_key: Primary key field for merge operations (default: ":id")
+        write_disposition: Write disposition ("replace", "append", "merge").
+            If not provided, automatically determined based on the incremental setting.
+
+    Returns:
+        A dlt source with a single "dataset" resource
+    """
+
+    @dlt.resource(
+        write_disposition=write_disposition or "replace",
+        primary_key=primary_key,  # type: ignore[call-overload]
+    )
+    def dataset(
+        incremental: Optional[dlt.sources.incremental] = incremental,  # type: ignore[type-arg]
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Yields records from a Socrata dataset.
+
+        Supports both full refresh (replace) and incremental loading (merge).
+        When incremental is provided, filters data using a SoQL WHERE clause on the server side.
+
+        Yields:
+            Dict[str, Any]: Individual records from the dataset
+        """
+        fetch_kwargs: Dict[str, Any] = {
+            "domain": domain,
+            "dataset_id": dataset_id,
+            "app_token": app_token,
+            "username": username,
+            "password": password,
+        }
+
+        if incremental and incremental.cursor_path:
+            fetch_kwargs["incremental_key"] = incremental.cursor_path
+            fetch_kwargs["start_value"] = (
+                str(incremental.last_value)
+                if incremental.last_value is not None
+                else None
+            )
+            if getattr(incremental, "end_value", None) is not None:
+                ev = incremental.end_value  # type: ignore[attr-defined]
+                fetch_kwargs["end_value"] = (
+                    ev.isoformat()  # type: ignore[union-attr]
+                    if hasattr(ev, "isoformat")
+                    else str(ev)
+                )
+
+        # Fetch and yield records
+        yield from fetch_data(**fetch_kwargs)
+
+    return (dataset,)
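A hedged usage sketch for the new source, run as a plain dlt pipeline; the pipeline name, destination, and dataset name are illustrative, while the domain and dataset id are the examples from the docstring above:

import dlt

from ingestr.src.socrata_source import source as socrata_source

pipeline = dlt.pipeline(
    pipeline_name="socrata_demo",
    destination="duckdb",
    dataset_name="open_data",
)

info = pipeline.run(
    socrata_source(
        domain="evergreen.data.socrata.com",
        dataset_id="6udu-fhnu",
        incremental=dlt.sources.incremental(":updated_at"),  # Socrata system field
    )
)
print(info)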
ingestr/src/socrata_source/helpers.py
@@ -0,0 +1,85 @@
+"""Socrata API helpers"""
+
+from typing import Any, Dict, Iterator, Optional
+
+from dlt.sources.helpers import requests
+
+from .settings import DEFAULT_PAGE_SIZE, REQUEST_TIMEOUT
+
+
+def fetch_data(
+    domain: str,
+    dataset_id: str,
+    app_token: Optional[str] = None,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    incremental_key: Optional[str] = None,
+    start_value: Optional[str] = None,
+    end_value: Optional[str] = None,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Fetch records from a Socrata dataset with pagination and optional filtering.
+
+    Uses offset-based pagination to get all records, not just the first 50000.
+    Supports incremental loading via a SoQL WHERE clause for server-side filtering.
+
+    Args:
+        domain: Socrata domain (e.g., "data.seattle.gov")
+        dataset_id: Dataset identifier (e.g., "6udu-fhnu")
+        app_token: Socrata app token for higher rate limits
+        username: Username for authentication
+        password: Password for authentication
+        start_value: Minimum value for incremental_key (inclusive)
+        end_value: Maximum value for incremental_key (exclusive)
+
+    Yields:
+        Lists of records (one list per page)
+
+    Raises:
+        requests.HTTPError: If the API request fails
+    """
+    url = f"https://{domain}/resource/{dataset_id}.json"
+
+    headers = {"Accept": "application/json"}
+    if app_token:
+        headers["X-App-Token"] = app_token
+
+    auth = (username, password) if username and password else None
+
+    limit = DEFAULT_PAGE_SIZE
+    offset = 0
+
+    while True:
+        params: Dict[str, Any] = {"$limit": limit, "$offset": offset}
+
+        if incremental_key and start_value:
+            start_value_iso = str(start_value).replace(" ", "T")
+            where_conditions = [f"{incremental_key} >= '{start_value_iso}'"]
+
+            if end_value:
+                end_value_iso = str(end_value).replace(" ", "T")
+                where_conditions.append(f"{incremental_key} < '{end_value_iso}'")
+
+            params["$where"] = " AND ".join(where_conditions)
+            params["$order"] = f"{incremental_key} ASC"
+
+        response = requests.get(
+            url,
+            headers=headers,
+            auth=auth,
+            params=params,
+            timeout=REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+
+        data = response.json()
+
+        if not data:
+            break
+
+        yield data
+
+        if len(data) < limit:
+            break
+
+        offset += limit
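The SoQL query that fetch_data assembles can be reproduced with a plain HTTP request; the domain, dataset id, cursor field, and date bounds below are illustrative:

import requests

params = {
    "$limit": 1000,
    "$offset": 0,
    "$where": ":updated_at >= '2024-01-01T00:00:00' AND :updated_at < '2024-02-01T00:00:00'",
    "$order": ":updated_at ASC",
}
response = requests.get(
    "https://data.seattle.gov/resource/6udu-fhnu.json",
    params=params,
    timeout=30,
)
response.raise_for_status()
print(len(response.json()), "records on this page")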