ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/snapchat_ads/helpers.py (new file)
@@ -0,0 +1,535 @@

```python
from typing import Iterator

import requests

from .client import SnapchatAdsAPI, create_client


def client_side_date_filter(data: dict, start_date, end_date) -> bool:
    """
    Check if data item falls within the specified date range based on updated_at.

    """
    if not start_date and not end_date:
        return True

    from dlt.common.time import ensure_pendulum_datetime

    updated_at_str = data.get("updated_at")
    if not updated_at_str:
        return True

    updated_at = ensure_pendulum_datetime(updated_at_str)

    if start_date and updated_at < ensure_pendulum_datetime(start_date):
        return False

    if end_date and updated_at > ensure_pendulum_datetime(end_date):
        return False

    return True


def paginate(client: requests.Session, headers: dict, url: str, page_size: int = 1000):
    """
    Helper to paginate through Snapchat API responses.
    """
    from urllib.parse import parse_qs, urlparse

    params: dict[str, int | str] = {"limit": page_size}

    while url:
        response = client.get(url, headers=headers, params=params)
        response.raise_for_status()

        result = response.json()

        if result.get("request_status", "").upper() != "SUCCESS":
            raise ValueError(
                f"Request failed: {result.get('request_status')} - {result}"
            )

        yield result

        # Check for next page
        paging = result.get("paging", {})
        next_link = paging.get("next_link")

        if next_link:
            # Extract cursor from next_link
            parsed = urlparse(next_link)
            query_params = parse_qs(parsed.query)
            cursor_list = query_params.get("cursor", [None])
            cursor = cursor_list[0] if cursor_list else None

            if cursor:
                params["cursor"] = cursor
            else:
                break
        else:
            break


def get_account_ids(
    api: "SnapchatAdsAPI",
    ad_account_id: str | None,
    organization_id: str | None,
    base_url: str,
    resource_name: str,
    start_date=None,
    end_date=None,
) -> list[str]:
    """
    Get list of account IDs to fetch data for.

    If ad_account_id is provided, returns a list with that single account.
    Otherwise, fetches all ad accounts for the organization.
    """
    if ad_account_id:
        return [ad_account_id]

    if not organization_id:
        raise ValueError(
            f"organization_id is required to fetch {resource_name} for all ad accounts"
        )

    accounts_url = f"{base_url}/organizations/{organization_id}/adaccounts"
    # Don't filter accounts by date - we want all accounts, then filter stats by date
    accounts_data = list(
        fetch_snapchat_data(api, accounts_url, "adaccounts", "adaccount", None, None)
    )
    return [
        account_id
        for account in accounts_data
        if (account_id := account.get("id")) is not None
    ]


def fetch_snapchat_data(
    api: "SnapchatAdsAPI",
    url: str,
    resource_key: str,
    item_key: str,
    start_date=None,
    end_date=None,
) -> Iterator[dict]:
    """
    Generic helper to fetch data from Snapchat API.
    """
    client = create_client()
    headers = api.get_headers()

    response = client.get(url, headers=headers)
    response.raise_for_status()

    result = response.json()

    if result.get("request_status", "").upper() != "SUCCESS":
        raise ValueError(f"Request failed: {result.get('request_status')} - {result}")

    items_data = result.get(resource_key, [])

    for item in items_data:
        if item.get("sub_request_status", "").upper() == "SUCCESS":
            data = item.get(item_key, {})
            if data:
                # Client-side filtering by updated_at
                if client_side_date_filter(data, start_date, end_date):
                    yield data


def fetch_snapchat_data_with_params(
    api: "SnapchatAdsAPI",
    url: str,
    resource_key: str,
    item_key: str,
    params: dict | None = None,
) -> Iterator[dict]:
    """
    Generic helper to fetch data from Snapchat API with query parameters.
    """
    client = create_client()
    headers = api.get_headers()

    response = client.get(url, headers=headers, params=params or {})
    response.raise_for_status()

    result = response.json()

    if result.get("request_status", "").upper() != "SUCCESS":
        raise ValueError(f"Request failed: {result.get('request_status')} - {result}")

    items_data = result.get(resource_key, [])

    for item in items_data:
        if item.get("sub_request_status", "").upper() == "SUCCESS":
            data = item.get(item_key, {})
            if data:
                yield data


def fetch_account_id_resource(
    api: "SnapchatAdsAPI",
    ad_account_id: str | None,
    organization_id: str | None,
    base_url: str,
    resource_name: str,
    item_key: str,
    start_date=None,
    end_date=None,
) -> Iterator[dict]:
    """
    Fetch resource data for ad accounts without pagination.

    If ad_account_id is provided, fetches data for that specific account.
    Otherwise, fetches all ad accounts and then fetches data for each account.
    """
    account_ids = get_account_ids(
        api,
        ad_account_id,
        organization_id,
        base_url,
        resource_name,
        start_date,
        end_date,
    )

    for account_id in account_ids:
        url = f"{base_url}/adaccounts/{account_id}/{resource_name}"
        yield from fetch_snapchat_data(
            api, url, resource_name, item_key, start_date, end_date
        )


def fetch_with_paginate_account_id(
    api: "SnapchatAdsAPI",
    ad_account_id: str | None,
    organization_id: str | None,
    base_url: str,
    resource_name: str,
    item_key: str,
    start_date=None,
    end_date=None,
) -> Iterator[dict]:
    """
    Fetch paginated resource data for ad accounts.

    If ad_account_id is provided, fetches data for that specific account.
    Otherwise, fetches all ad accounts and then fetches data for each account.
    """
    account_ids = get_account_ids(
        api,
        ad_account_id,
        organization_id,
        base_url,
        resource_name,
        start_date,
        end_date,
    )

    client = create_client()
    headers = api.get_headers()

    for account_id in account_ids:
        url = f"{base_url}/adaccounts/{account_id}/{resource_name}"

        for result in paginate(client, headers, url, page_size=1000):
            items_data = result.get(resource_name, [])

            for item in items_data:
                if item.get("sub_request_status", "").upper() == "SUCCESS":
                    data = item.get(item_key, {})
                    if data:
                        if client_side_date_filter(data, start_date, end_date):
                            yield data


def build_stats_url(
    base_url: str,
    entity_type: str,
    entity_id: str,
) -> str:
    """
    Build the stats URL for a given entity type and ID.

    Args:
        base_url: Base API URL
        entity_type: Type of entity (campaign, adsquad, ad, adaccount)
        entity_id: ID of the entity

    Returns:
        Complete stats URL
    """
    entity_type_map = {
        "campaign": "campaigns",
        "adsquad": "adsquads",
        "ad": "ads",
        "adaccount": "adaccounts",
    }

    plural_entity = entity_type_map.get(entity_type)
    if not plural_entity:
        raise ValueError(
            f"Invalid entity_type: {entity_type}. Must be one of: {list(entity_type_map.keys())}"
        )

    return f"{base_url}/{plural_entity}/{entity_id}/stats"


def fetch_stats_data(
    api: "SnapchatAdsAPI",
    url: str,
    params: dict,
    granularity: str,
) -> Iterator[dict]:
    """
    Fetch stats data from Snapchat API.

    Args:
        api: SnapchatAdsAPI instance
        url: Stats endpoint URL
        params: Query parameters
        granularity: Granularity of stats (TOTAL, DAY, HOUR, LIFETIME)

    Yields:
        Flattened stats records
    """
    client = create_client()
    headers = api.get_headers()

    response = client.get(url, headers=headers, params=params)
    if not response.ok:
        raise ValueError(
            f"Stats request failed: {response.status_code} - {response.text}"
        )
    response.raise_for_status()

    result = response.json()

    if result.get("request_status", "").upper() != "SUCCESS":
        raise ValueError(f"Request failed: {result.get('request_status')} - {result}")

    # Parse based on granularity
    if granularity in ["TOTAL", "LIFETIME"]:
        yield from parse_total_stats(result)
    else:  # DAY or HOUR
        yield from parse_timeseries_stats(result)


def parse_total_stats(result: dict) -> Iterator[dict]:
    """
    Parse TOTAL or LIFETIME granularity stats response.

    Args:
        result: API response JSON

    Yields:
        Flattened stats records
    """
    # Handle both total_stats and lifetime_stats response formats
    total_stats = result.get("total_stats", []) or result.get("lifetime_stats", [])

    for stat_item in total_stats:
        if stat_item.get("sub_request_status", "").upper() == "SUCCESS":
            # Handle both total_stat and lifetime_stat keys
            total_stat = stat_item.get("total_stat", {}) or stat_item.get(
                "lifetime_stat", {}
            )
            if total_stat:
                # Flatten the stats object
                record = {
                    "id": total_stat.get("id"),
                    "type": total_stat.get("type"),
                    "granularity": total_stat.get("granularity"),
                    "start_time": total_stat.get("start_time"),
                    "end_time": total_stat.get("end_time"),
                    "finalized_data_end_time": total_stat.get(
                        "finalized_data_end_time"
                    ),
                    "conversion_data_processed_end_time": total_stat.get(
                        "conversion_data_processed_end_time"
                    ),
                    "swipe_up_attribution_window": total_stat.get(
                        "swipe_up_attribution_window"
                    ),
                    "view_attribution_window": total_stat.get(
                        "view_attribution_window"
                    ),
                }

                # Flatten nested stats
                stats = total_stat.get("stats", {})
                for key, value in stats.items():
                    record[key] = value

                # Handle breakdown_stats if present
                breakdown_stats = total_stat.get("breakdown_stats", {})
                if breakdown_stats:
                    for breakdown_type, breakdown_items in breakdown_stats.items():
                        for item in breakdown_items:
                            breakdown_record = record.copy()
                            breakdown_record["breakdown_type"] = breakdown_type
                            breakdown_record["breakdown_id"] = item.get("id")
                            breakdown_record["breakdown_entity_type"] = item.get("type")

                            item_stats = item.get("stats", {})
                            for key, value in item_stats.items():
                                breakdown_record[key] = value

                            yield breakdown_record
                else:
                    yield record


def parse_timeseries_stats(result: dict) -> Iterator[dict]:
    """
    Parse DAY or HOUR granularity stats response.

    Args:
        result: API response JSON

    Yields:
        Flattened stats records for each time period
    """
    timeseries_stats = result.get("timeseries_stats", [])

    for stat_item in timeseries_stats:
        if stat_item.get("sub_request_status", "").upper() == "SUCCESS":
            timeseries_stat = stat_item.get("timeseries_stat", {})
            if timeseries_stat:
                entity_id = timeseries_stat.get("id")
                entity_type = timeseries_stat.get("type")
                granularity = timeseries_stat.get("granularity")
                finalized_data_end_time = timeseries_stat.get("finalized_data_end_time")
                conversion_data_processed_end_time = timeseries_stat.get(
                    "conversion_data_processed_end_time"
                )
                swipe_up_attribution_window = timeseries_stat.get(
                    "swipe_up_attribution_window"
                )
                view_attribution_window = timeseries_stat.get("view_attribution_window")

                # Iterate through each time period
                timeseries = timeseries_stat.get("timeseries", [])
                for period in timeseries:
                    record = {
                        "id": entity_id,
                        "type": entity_type,
                        "granularity": granularity,
                        "start_time": period.get("start_time"),
                        "end_time": period.get("end_time"),
                        "finalized_data_end_time": finalized_data_end_time,
                        "conversion_data_processed_end_time": conversion_data_processed_end_time,
                        "swipe_up_attribution_window": swipe_up_attribution_window,
                        "view_attribution_window": view_attribution_window,
                    }

                    # Flatten nested stats
                    stats = period.get("stats", {})
                    for key, value in stats.items():
                        record[key] = value

                    yield record

                # Handle breakdown_stats if present in timeseries
                breakdown_stats = timeseries_stat.get("breakdown_stats", {})
                if breakdown_stats:
                    for breakdown_type, breakdown_items in breakdown_stats.items():
                        for item in breakdown_items:
                            item_timeseries = item.get("timeseries", [])
                            for period in item_timeseries:
                                breakdown_record = {
                                    "id": entity_id,
                                    "type": entity_type,
                                    "granularity": granularity,
                                    "start_time": period.get("start_time"),
                                    "end_time": period.get("end_time"),
                                    "finalized_data_end_time": finalized_data_end_time,
                                    "conversion_data_processed_end_time": conversion_data_processed_end_time,
                                    "swipe_up_attribution_window": swipe_up_attribution_window,
                                    "view_attribution_window": view_attribution_window,
                                    "breakdown_type": breakdown_type,
                                    "breakdown_id": item.get("id"),
                                    "breakdown_entity_type": item.get("type"),
                                }

                                item_stats = period.get("stats", {})
                                for key, value in item_stats.items():
                                    breakdown_record[key] = value

                                yield breakdown_record


def fetch_entity_stats(
    api: "SnapchatAdsAPI",
    entity_type: str,
    ad_account_id: str | None,
    organization_id: str | None,
    base_url: str,
    params: dict,
    granularity: str,
    start_date=None,
    end_date=None,
) -> Iterator[dict]:
    """
    Fetch stats for all entities of a given type.

    First fetches all entities (campaigns, ads, adsquads, or adaccounts),
    then fetches stats for each entity.

    Args:
        api: SnapchatAdsAPI instance
        entity_type: Type of entity (campaign, adsquad, ad, adaccount)
        ad_account_id: Specific ad account ID (optional)
        organization_id: Organization ID (required if ad_account_id not provided)
        base_url: Base API URL
        params: Query parameters for stats request
        granularity: Granularity of stats (TOTAL, DAY, HOUR, LIFETIME)
        start_date: Start date for filtering entities
        end_date: End date for filtering entities

    Yields:
        Flattened stats records
    """
    # Get account IDs
    account_ids = get_account_ids(
        api, ad_account_id, organization_id, base_url, "stats", start_date, end_date
    )

    if not account_ids:
        return

    if entity_type == "adaccount":
        # For ad accounts, fetch stats directly for each account
        for account_id in account_ids:
            url = f"{base_url}/adaccounts/{account_id}/stats"
            yield from fetch_stats_data(api, url, params, granularity)
    else:
        # For campaign, adsquad, ad - first fetch entities, then stats
        entity_type_map = {
            "campaign": ("campaigns", "campaign"),
            "adsquad": ("adsquads", "adsquad"),
            "ad": ("ads", "ad"),
        }

        resource_name, item_key = entity_type_map[entity_type]
        client = create_client()
        headers = api.get_headers()

        for account_id in account_ids:
            url = f"{base_url}/adaccounts/{account_id}/{resource_name}"

            for result in paginate(client, headers, url, page_size=1000):
                items_data = result.get(resource_name, [])

                for item in items_data:
                    if item.get("sub_request_status", "").upper() == "SUCCESS":
                        data = item.get(item_key, {})
                        if data and data.get("id"):
                            entity_id = data["id"]
                            stats_url = build_stats_url(
                                base_url, entity_type, entity_id
                            )
                            yield from fetch_stats_data(
                                api, stats_url, params, granularity
                            )
```
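As a usage sketch (not part of the diff itself): the helpers above compose into a single call for stats ingestion. The `SnapchatAdsAPI` constructor arguments, the base URL, and the stats query parameters below are assumptions; the real client lives in `ingestr/src/snapchat_ads/client.py`, which this hunk does not show.

```python
# Hedged sketch: pull DAY-granularity campaign stats for every ad account in
# an organization, via fetch_entity_stats from the file above.
from ingestr.src.snapchat_ads.client import SnapchatAdsAPI
from ingestr.src.snapchat_ads.helpers import fetch_entity_stats

# Assumed constructor; see client.py in the wheel for the actual arguments.
api = SnapchatAdsAPI(
    client_id="...",
    client_secret="...",
    refresh_token="...",
)

for record in fetch_entity_stats(
    api,
    entity_type="campaign",
    ad_account_id=None,  # None -> resolve all accounts via the organization
    organization_id="<organization-id>",
    base_url="https://adsapi.snapchat.com/v1",  # assumed Marketing API base URL
    params={"granularity": "DAY"},  # illustrative; real stats calls take more params
    granularity="DAY",
):
    print(record["id"], record.get("start_time"))
```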
ingestr/src/socrata_source/__init__.py (new file)
@@ -0,0 +1,83 @@

```python
"""A source loading data from Socrata open data platform"""

from typing import Any, Dict, Iterator, Optional

import dlt

from .helpers import fetch_data


@dlt.source(name="socrata", max_table_nesting=0)
def source(
    domain: str,
    dataset_id: str,
    app_token: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    incremental: Optional[Any] = None,
    primary_key: Optional[str] = None,
    write_disposition: Optional[str] = dlt.config.value,
):
    """
    A dlt source for the Socrata open data platform.

    Supports both full refresh (replace) and incremental loading (merge).

    Args:
        domain: The Socrata domain (e.g., "evergreen.data.socrata.com")
        dataset_id: The dataset identifier (e.g., "6udu-fhnu")
        app_token: Socrata app token for higher rate limits (recommended)
        username: Username for authentication (if dataset is private)
        password: Password for authentication (if dataset is private)
        incremental: DLT incremental object for incremental loading
        primary_key: Primary key field for merge operations (default: ":id")
        write_disposition: Write disposition ("replace", "append", "merge").
            If not provided, automatically determined based on incremental setting.

    Returns:
        A dlt source with a single "dataset" resource
    """

    @dlt.resource(
        write_disposition=write_disposition or "replace",
        primary_key=primary_key,  # type: ignore[call-overload]
    )
    def dataset(
        incremental: Optional[dlt.sources.incremental] = incremental,  # type: ignore[type-arg]
    ) -> Iterator[Dict[str, Any]]:
        """
        Yields records from a Socrata dataset.

        Supports both full refresh (replace) and incremental loading (merge).
        When incremental is provided, filters data using SoQL WHERE clause on the server side.

        Yields:
            Dict[str, Any]: Individual records from the dataset
        """
        fetch_kwargs: Dict[str, Any] = {
            "domain": domain,
            "dataset_id": dataset_id,
            "app_token": app_token,
            "username": username,
            "password": password,
        }

        if incremental and incremental.cursor_path:
            fetch_kwargs["incremental_key"] = incremental.cursor_path
            fetch_kwargs["start_value"] = (
                str(incremental.last_value)
                if incremental.last_value is not None
                else None
            )
            if getattr(incremental, "end_value", None) is not None:
                ev = incremental.end_value  # type: ignore[attr-defined]
                fetch_kwargs["end_value"] = (
                    ev.isoformat()  # type: ignore[union-attr]
                    if hasattr(ev, "isoformat")
                    else str(ev)
                )

        # Fetch and yield records
        yield from fetch_data(**fetch_kwargs)

    return (dataset,)
```
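A minimal sketch of wiring this source into a dlt pipeline, assuming a local duckdb destination and a dataset that exposes an `updated_at` column to serve as the incremental cursor; the domain and dataset id are the illustrative values from the docstring above.

```python
import dlt

from ingestr.src.socrata_source import source

pipeline = dlt.pipeline(
    pipeline_name="socrata_demo",
    destination="duckdb",
    dataset_name="socrata_data",
)

# Incremental merge: dlt tracks the cursor value between runs, and the
# resource above pushes it down to Socrata as a SoQL WHERE clause.
info = pipeline.run(
    source(
        domain="data.seattle.gov",  # illustrative domain
        dataset_id="6udu-fhnu",     # example id from the docstring
        incremental=dlt.sources.incremental("updated_at"),
        primary_key=":id",          # the docstring's stated default for merge
        write_disposition="merge",
    )
)
print(info)
```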
ingestr/src/socrata_source/helpers.py (new file)
@@ -0,0 +1,85 @@

```python
"""Socrata API helpers"""

from typing import Any, Dict, Iterator, Optional

from dlt.sources.helpers import requests

from .settings import DEFAULT_PAGE_SIZE, REQUEST_TIMEOUT


def fetch_data(
    domain: str,
    dataset_id: str,
    app_token: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    incremental_key: Optional[str] = None,
    start_value: Optional[str] = None,
    end_value: Optional[str] = None,
) -> Iterator[Dict[str, Any]]:
    """
    Fetch records from Socrata dataset with pagination and optional filtering.

    Uses offset-based pagination to get all records, not just first 50000.
    Supports incremental loading via SoQL WHERE clause for server-side filtering.

    Args:
        domain: Socrata domain (e.g., "data.seattle.gov")
        dataset_id: Dataset identifier (e.g., "6udu-fhnu")
        app_token: Socrata app token for higher rate limits
        username: Username for authentication
        password: Password for authentication
        start_value: Minimum value for incremental_key (inclusive)
        end_value: Maximum value for incremental_key (exclusive)

    Yields:
        Lists of records (one list per page)

    Raises:
        requests.HTTPError: If API request fails
    """
    url = f"https://{domain}/resource/{dataset_id}.json"

    headers = {"Accept": "application/json"}
    if app_token:
        headers["X-App-Token"] = app_token

    auth = (username, password) if username and password else None

    limit = DEFAULT_PAGE_SIZE
    offset = 0

    while True:
        params: Dict[str, Any] = {"$limit": limit, "$offset": offset}

        if incremental_key and start_value:
            start_value_iso = str(start_value).replace(" ", "T")
            where_conditions = [f"{incremental_key} >= '{start_value_iso}'"]

            if end_value:
                end_value_iso = str(end_value).replace(" ", "T")
                where_conditions.append(f"{incremental_key} < '{end_value_iso}'")

            params["$where"] = " AND ".join(where_conditions)
            params["$order"] = f"{incremental_key} ASC"

        response = requests.get(
            url,
            headers=headers,
            auth=auth,
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        data = response.json()

        if not data:
            break

        yield data

        if len(data) < limit:
            break

        offset += limit
```