ingestr 0.14.93__py3-none-any.whl → 0.14.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

@@ -64,12 +64,9 @@ def _paginate(
64
64
  while True:
65
65
  data = _make_request(api_key, endpoint, current_params)
66
66
 
67
- # Yield items from the current page
68
67
  if "items" in data and data["items"] is not None:
69
- for item in data["items"]:
70
- yield item
68
+ yield data["items"]
71
69
 
72
- # Check if there's a next page
73
70
  if "next_page" not in data:
74
71
  break
75
72
 
@@ -88,7 +85,6 @@ def convert_timestamps_to_iso(
88
85
  """Convert timestamp fields from milliseconds to ISO format."""
89
86
  for field in timestamp_fields:
90
87
  if field in record and record[field] is not None:
91
- # Convert from milliseconds timestamp to ISO datetime string
92
88
  timestamp_ms = record[field]
93
89
  dt = pendulum.from_timestamp(timestamp_ms / 1000)
94
90
  record[field] = dt.to_iso8601_string()
@@ -177,87 +173,37 @@ async def _paginate_async(
177
173
  return items
178
174
 
179
175
 
180
- async def fetch_and_process_nested_resource_async(
181
- session: aiohttp.ClientSession,
182
- api_key: str,
183
- project_id: str,
184
- customer_id: str,
185
- customer: Dict[str, Any],
186
- resource_name: str,
187
- timestamp_fields: Optional[List[str]] = None,
188
- ) -> None:
189
- """
190
- Fetch and process any nested resource for a customer asynchronously.
191
-
192
- Args:
193
- session: aiohttp ClientSession
194
- api_key: RevenueCat API key
195
- project_id: Project ID
196
- customer_id: Customer ID
197
- customer: Customer data dictionary to modify
198
- resource_name: Name of the nested resource (e.g., 'purchases', 'subscriptions', 'events')
199
- timestamp_fields: List of timestamp fields to convert to ISO format
200
- """
201
- # If resource not included in customer data, fetch separately
202
- if resource_name not in customer or customer[resource_name] is None:
203
- endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
204
- customer[resource_name] = await _paginate_async(session, api_key, endpoint)
205
-
206
- # Convert timestamps if fields specified
207
- if (
208
- timestamp_fields
209
- and resource_name in customer
210
- and customer[resource_name] is not None
211
- ):
212
- for item in customer[resource_name]:
213
- convert_timestamps_to_iso(item, timestamp_fields)
214
-
215
-
216
176
async def process_customer_with_nested_resources_async(
    session: aiohttp.ClientSession,
    api_key: str,
    project_id: str,
    customer: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Normalise a customer record and attach its nested resources.

    The customer's "first_seen_at" / "last_seen_at" fields are passed through
    convert_timestamps_to_iso, then "subscriptions" and "purchases" are
    resolved concurrently: a resource that is absent (or None) on the record
    is fetched from the per-customer endpoint, and the configured timestamp
    fields of every item are converted.

    Args:
        session: aiohttp ClientSession handed to _paginate_async
        api_key: API key handed to _paginate_async
        project_id: Project ID used to build the nested-resource endpoints
        customer: Customer record; must contain an "id" key

    Returns:
        The customer record returned by convert_timestamps_to_iso, with the
        nested resources populated
    """
    customer_id = customer["id"]
    customer = convert_timestamps_to_iso(customer, ["first_seen_at", "last_seen_at"])

    # (resource name, timestamp fields to convert on each of its items)
    nested_resources = [
        ("subscriptions", ["purchased_at", "expires_at", "grace_period_expires_at"]),
        ("purchases", ["purchased_at", "expires_at"]),
    ]

    async def fetch_and_convert(resource_name, timestamp_fields):
        # Only hit the API when the record does not already carry the resource.
        if customer.get(resource_name) is None:
            endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
            customer[resource_name] = await _paginate_async(session, api_key, endpoint)

        if not timestamp_fields:
            return
        nested_items = customer.get(resource_name)
        if nested_items is None:
            return
        for nested_item in nested_items:
            convert_timestamps_to_iso(nested_item, timestamp_fields)

    # One coroutine per nested resource, awaited together.
    pending = [fetch_and_convert(name, fields) for name, fields in nested_resources]
    await asyncio.gather(*pending)

    return customer
263
209
 
@@ -0,0 +1,83 @@
1
+ """A source loading data from Socrata open data platform"""
2
+
3
+ from typing import Any, Dict, Iterator, Optional
4
+
5
+ import dlt
6
+
7
+ from .helpers import fetch_data
8
+
9
+
10
@dlt.source(name="socrata", max_table_nesting=0)
def source(
    domain: str,
    dataset_id: str,
    app_token: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    incremental: Optional[Any] = None,
    primary_key: Optional[str] = None,
    write_disposition: Optional[str] = dlt.config.value,
):
    """
    A dlt source for the Socrata open data platform.

    Args:
        domain: The Socrata domain (e.g., "evergreen.data.socrata.com")
        dataset_id: The dataset identifier (e.g., "6udu-fhnu")
        app_token: Socrata app token, forwarded to fetch_data
        username: Username for authentication, forwarded to fetch_data
        password: Password for authentication, forwarded to fetch_data
        incremental: dlt incremental object; when it is truthy and has a
            cursor_path, its cursor and bounds are forwarded to fetch_data
        primary_key: Passed unchanged to dlt.resource (None when omitted)
        write_disposition: Write disposition for the resource; "replace" is
            used whenever the supplied value is falsy

    Returns:
        A one-element tuple holding the "dataset" resource
    """

    @dlt.resource(
        write_disposition=write_disposition or "replace",
        primary_key=primary_key,  # type: ignore[call-overload]
    )
    def dataset(
        incremental: Optional[dlt.sources.incremental] = incremental,  # type: ignore[type-arg]
    ) -> Iterator[Dict[str, Any]]:
        """
        Yield whatever fetch_data produces for the configured dataset.

        Without a usable incremental cursor the dataset is requested
        unfiltered; otherwise the cursor path, the stringified last value and
        (if present) the end value are handed to fetch_data as filter bounds.
        """
        request_args: Dict[str, Any] = dict(
            domain=domain,
            dataset_id=dataset_id,
            app_token=app_token,
            username=username,
            password=password,
        )

        # Full load: nothing to add to the request.
        if not (incremental and incremental.cursor_path):
            yield from fetch_data(**request_args)
            return

        request_args["incremental_key"] = incremental.cursor_path

        last_seen = incremental.last_value
        request_args["start_value"] = None if last_seen is None else str(last_seen)

        # end_value is looked up defensively; it may be missing on the object.
        upper_bound = getattr(incremental, "end_value", None)
        if upper_bound is not None:
            if hasattr(upper_bound, "isoformat"):
                request_args["end_value"] = upper_bound.isoformat()
            else:
                request_args["end_value"] = str(upper_bound)

        yield from fetch_data(**request_args)

    return (dataset,)
@@ -0,0 +1,85 @@
1
+ """Socrata API helpers"""
2
+
3
+ from typing import Any, Dict, Iterator, Optional
4
+
5
+ from dlt.sources.helpers import requests
6
+
7
+ from .settings import DEFAULT_PAGE_SIZE, REQUEST_TIMEOUT
8
+
9
+
10
def fetch_data(
    domain: str,
    dataset_id: str,
    app_token: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    incremental_key: Optional[str] = None,
    start_value: Optional[str] = None,
    end_value: Optional[str] = None,
) -> Iterator[Dict[str, Any]]:
    """
    Fetch records from a Socrata dataset with pagination and optional filtering.

    Pages through the dataset with $limit/$offset until a short or empty page
    is returned. Every request carries an explicit $order so that offsets
    refer to the same row sequence on each page.

    Args:
        domain: Socrata domain (e.g., "data.seattle.gov")
        dataset_id: Dataset identifier (e.g., "6udu-fhnu")
        app_token: Socrata app token, sent as the X-App-Token header
        username: Username for basic authentication
        password: Password for basic authentication (both must be set)
        incremental_key: Column used for the $where bounds and for ordering;
            when omitted, start_value/end_value are ignored and rows are
            ordered by :id
        start_value: Minimum value for incremental_key (inclusive)
        end_value: Maximum value for incremental_key (exclusive)

    Yields:
        The decoded JSON body of each non-empty page (one list of records
        per page). NOTE(review): the return annotation describes single
        records; it is kept unchanged for interface compatibility.

    Raises:
        requests.HTTPError: If API request fails
    """

    def _soql_literal(value: Any) -> str:
        # "YYYY-MM-DD HH:MM:SS" is normalised to the "T"-separated form, and
        # embedded single quotes are doubled so the value cannot terminate
        # the SoQL string literal early.
        text = str(value).replace(" ", "T")
        return "'" + text.replace("'", "''") + "'"

    url = f"https://{domain}/resource/{dataset_id}.json"

    headers = {"Accept": "application/json"}
    if app_token:
        headers["X-App-Token"] = app_token

    auth = (username, password) if username and password else None

    limit = DEFAULT_PAGE_SIZE

    # Everything except $offset is identical for every page: build it once.
    query: Dict[str, Any] = {"$limit": limit}
    if incremental_key:
        where_conditions = []
        if start_value:
            where_conditions.append(
                f"{incremental_key} >= {_soql_literal(start_value)}"
            )
        # The upper bound applies on its own too (e.g. first run without a
        # stored cursor value but with an explicit interval end).
        if end_value:
            where_conditions.append(
                f"{incremental_key} < {_soql_literal(end_value)}"
            )
        if where_conditions:
            query["$where"] = " AND ".join(where_conditions)
        # :id breaks ties between rows sharing the same cursor value so that
        # page boundaries stay deterministic.
        query["$order"] = f"{incremental_key} ASC, :id ASC"
    else:
        # Offset paging needs a stable order even for a full load.
        query["$order"] = ":id"

    offset = 0
    while True:
        response = requests.get(
            url,
            headers=headers,
            auth=auth,
            params={**query, "$offset": offset},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        page = response.json()
        if not page:
            break

        yield page

        # A short page means the dataset is exhausted.
        if len(page) < limit:
            break

        offset += limit
@@ -0,0 +1,8 @@
1
+ """Socrata API settings and constants"""
2
+
3
# Timeout applied to each HTTP request, in seconds
REQUEST_TIMEOUT = 30

# Page size used for each paginated request
# Socrata API supports up to 50000 records per request
DEFAULT_PAGE_SIZE = 50_000