ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79)
  1. ingestr/main.py +22 -3
  2. ingestr/src/adjust/__init__.py +4 -4
  3. ingestr/src/allium/__init__.py +128 -0
  4. ingestr/src/anthropic/__init__.py +277 -0
  5. ingestr/src/anthropic/helpers.py +525 -0
  6. ingestr/src/appstore/__init__.py +1 -0
  7. ingestr/src/asana_source/__init__.py +1 -1
  8. ingestr/src/buildinfo.py +1 -1
  9. ingestr/src/chess/__init__.py +1 -1
  10. ingestr/src/couchbase_source/__init__.py +118 -0
  11. ingestr/src/couchbase_source/helpers.py +135 -0
  12. ingestr/src/cursor/__init__.py +83 -0
  13. ingestr/src/cursor/helpers.py +188 -0
  14. ingestr/src/destinations.py +169 -1
  15. ingestr/src/docebo/__init__.py +589 -0
  16. ingestr/src/docebo/client.py +435 -0
  17. ingestr/src/docebo/helpers.py +97 -0
  18. ingestr/src/elasticsearch/helpers.py +138 -0
  19. ingestr/src/errors.py +8 -0
  20. ingestr/src/facebook_ads/__init__.py +26 -23
  21. ingestr/src/facebook_ads/helpers.py +47 -1
  22. ingestr/src/factory.py +48 -0
  23. ingestr/src/filesystem/__init__.py +8 -3
  24. ingestr/src/filters.py +9 -0
  25. ingestr/src/fluxx/__init__.py +9906 -0
  26. ingestr/src/fluxx/helpers.py +209 -0
  27. ingestr/src/frankfurter/__init__.py +157 -163
  28. ingestr/src/frankfurter/helpers.py +3 -3
  29. ingestr/src/freshdesk/__init__.py +25 -8
  30. ingestr/src/freshdesk/freshdesk_client.py +40 -5
  31. ingestr/src/fundraiseup/__init__.py +49 -0
  32. ingestr/src/fundraiseup/client.py +81 -0
  33. ingestr/src/github/__init__.py +6 -4
  34. ingestr/src/google_analytics/__init__.py +1 -1
  35. ingestr/src/hostaway/__init__.py +302 -0
  36. ingestr/src/hostaway/client.py +288 -0
  37. ingestr/src/http/__init__.py +35 -0
  38. ingestr/src/http/readers.py +114 -0
  39. ingestr/src/hubspot/__init__.py +6 -12
  40. ingestr/src/influxdb/__init__.py +1 -0
  41. ingestr/src/intercom/__init__.py +142 -0
  42. ingestr/src/intercom/helpers.py +674 -0
  43. ingestr/src/intercom/settings.py +279 -0
  44. ingestr/src/jira_source/__init__.py +340 -0
  45. ingestr/src/jira_source/helpers.py +439 -0
  46. ingestr/src/jira_source/settings.py +170 -0
  47. ingestr/src/klaviyo/__init__.py +5 -5
  48. ingestr/src/linear/__init__.py +553 -116
  49. ingestr/src/linear/helpers.py +77 -38
  50. ingestr/src/mailchimp/__init__.py +126 -0
  51. ingestr/src/mailchimp/helpers.py +226 -0
  52. ingestr/src/mailchimp/settings.py +164 -0
  53. ingestr/src/masking.py +344 -0
  54. ingestr/src/monday/__init__.py +246 -0
  55. ingestr/src/monday/helpers.py +392 -0
  56. ingestr/src/monday/settings.py +328 -0
  57. ingestr/src/mongodb/__init__.py +5 -2
  58. ingestr/src/mongodb/helpers.py +384 -10
  59. ingestr/src/plusvibeai/__init__.py +335 -0
  60. ingestr/src/plusvibeai/helpers.py +544 -0
  61. ingestr/src/plusvibeai/settings.py +252 -0
  62. ingestr/src/revenuecat/__init__.py +83 -0
  63. ingestr/src/revenuecat/helpers.py +237 -0
  64. ingestr/src/salesforce/__init__.py +15 -8
  65. ingestr/src/shopify/__init__.py +1 -1
  66. ingestr/src/smartsheets/__init__.py +33 -5
  67. ingestr/src/socrata_source/__init__.py +83 -0
  68. ingestr/src/socrata_source/helpers.py +85 -0
  69. ingestr/src/socrata_source/settings.py +8 -0
  70. ingestr/src/sources.py +1418 -54
  71. ingestr/src/stripe_analytics/__init__.py +2 -19
  72. ingestr/src/wise/__init__.py +68 -0
  73. ingestr/src/wise/client.py +63 -0
  74. ingestr/tests/unit/test_smartsheets.py +6 -9
  75. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
  76. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
  77. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
  78. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
  79. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 from enum import Enum
 from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated
 
 from ingestr.src.telemetry.event import track
 
+try:
+    from duckdb_engine import DuckDBEngineWarning
+
+    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+except ImportError:
+    # duckdb-engine not installed
+    pass
+
 app = typer.Typer(
     name="ingestr",
     help="ingestr is the CLI tool to ingest data from one source to another",
@@ -273,6 +282,13 @@ def ingest(
             envvar=["STAGING_BUCKET", "INGESTR_STAGING_BUCKET"],
         ),
     ] = None,  # type: ignore
+    mask: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
+            envvar=["MASK", "INGESTR_MASK"],
+        ),
+    ] = [],  # type: ignore
 ):
     import hashlib
     import tempfile
@@ -293,6 +309,7 @@ def ingest(
     from ingestr.src.filters import (
         cast_set_to_list,
         cast_spanner_types,
+        create_masking_filter,
         handle_mysql_empty_dates,
     )
     from ingestr.src.sources import MongoDbSource
@@ -506,7 +523,6 @@ def ingest(
 
     if factory.source_scheme == "sqlite":
         source_table = "main." + source_table.split(".")[-1]
-
 
     if (
         incremental_key
@@ -554,6 +570,10 @@ def ingest(
     if factory.source_scheme.startswith("spanner"):
         resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
 
+    if mask:
+        masking_filter = create_masking_filter(mask)
+        resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
+
     if yield_limit:
         resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
 
@@ -600,10 +620,9 @@ def ingest(
     if factory.source_scheme == "influxdb":
         if primary_key:
             write_disposition = "merge"
-
 
     start_time = datetime.now()
-
+
     run_info: LoadInfo = pipeline.run(
         dlt_source,
         **destination.dlt_run_params(
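
The --mask flow above parses each 'column:algorithm[:param]' spec with create_masking_filter and applies the resulting callable to every record via add_map. A minimal sketch of that wiring; the 'hash' algorithm name and the sample row are hypothetical, since the available algorithms live in the new ingestr/src/masking.py, which this diff lists but does not show inline:

from ingestr.src.filters import create_masking_filter

# Build a row-level filter from CLI-style specs; 'email:hash' follows the
# 'column:algorithm[:param]' format ('hash' is a hypothetical algorithm name).
masking_filter = create_masking_filter(["email:hash"])

# ingestr applies the filter per record through resource.add_map(masking_filter);
# calling it directly on one dict illustrates the same transformation.
row = {"id": 1, "email": "user@example.com"}
masked_row = masking_filter(row)
print(masked_row)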
ingestr/src/adjust/__init__.py CHANGED
@@ -46,7 +46,7 @@ def adjust_source(
     filters: Optional[dict] = None,
 ) -> Sequence[DltResource]:
     @dlt.resource(write_disposition="merge", merge_key="day")
-    def campaigns():
+    def campaigns() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -57,12 +57,12 @@ def adjust_source(
         )
 
     @dlt.resource(write_disposition="replace", primary_key="id")
-    def events():
+    def events() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield adjust_api.fetch_events()
 
     @dlt.resource(write_disposition="merge", merge_key="day")
-    def creatives():
+    def creatives() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -95,7 +95,7 @@ def adjust_source(
         primary_key=dimensions,
         columns=type_hints,
     )
-    def custom():
+    def custom() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
ingestr/src/allium/__init__.py ADDED
@@ -0,0 +1,128 @@
+"""
+Allium source for data extraction via REST API.
+
+This source provides access to Allium blockchain data via asynchronous query execution.
+"""
+
+import time
+from typing import Any, Iterator
+
+import dlt
+
+from ingestr.src.http_client import create_client
+
+
+@dlt.source(max_table_nesting=0, name="allium_source")
+def allium_source(
+    api_key: str,
+    query_id: str,
+    parameters: dict[str, Any] | None = None,
+    limit: int | None = None,
+    compute_profile: str | None = None,
+) -> Any:
+    """
+    Allium data source for blockchain data extraction.
+
+    This source connects to Allium API, runs async queries, and fetches results.
+
+    Args:
+        api_key: Allium API key for authentication
+        query_id: The query ID to execute (e.g., 'abc123')
+        parameters: Optional parameters for the query (e.g., {'start_date': '2025-02-01', 'end_date': '2025-02-02'})
+        limit: Limit the number of rows in the result (max 250,000)
+        compute_profile: Compute profile identifier
+
+    Yields:
+        DltResource: Data resources for Allium query results
+    """
+    base_url = "https://api.allium.so/api/v1/explorer"
+    session = create_client()
+    headers = {"X-API-Key": api_key}
+
+    @dlt.resource(
+        name="query_results",
+        write_disposition="replace",
+    )
+    def fetch_query_results() -> Iterator[dict[str, Any]]:
+        """
+        Fetch query results from Allium.
+
+        This function:
+        1. Starts an async query execution
+        2. Polls for completion status
+        3. Fetches and yields the results
+        """
+        # Step 1: Start async query execution
+        run_config: dict[str, Any] = {}
+        if limit is not None:
+            run_config["limit"] = limit
+        if compute_profile is not None:
+            run_config["compute_profile"] = compute_profile
+
+        run_payload = {"parameters": parameters or {}, "run_config": run_config}
+
+        run_response = session.post(
+            f"{base_url}/queries/{query_id}/run-async",
+            json=run_payload,
+            headers=headers,
+        )
+
+        run_data = run_response.json()
+
+        if "run_id" not in run_data:
+            raise ValueError(f"Failed to start query execution: {run_data}")
+
+        run_id = run_data["run_id"]
+
+        # Step 2: Poll for completion
+        max_retries = 8640  # Max 12 hours with 5-second intervals
+        retry_count = 0
+        poll_interval = 5  # seconds
+
+        while retry_count < max_retries:
+            status_response = session.get(
+                f"{base_url}/query-runs/{run_id}/status",
+                headers=headers,
+            )
+            status_response.raise_for_status()
+            status_data = status_response.json()
+
+            # Handle both string and dict responses
+            if isinstance(status_data, str):
+                status = status_data
+            else:
+                status = status_data.get("status")
+
+            if status == "success":
+                break
+            elif status == "failed":
+                error_msg = (
+                    status_data.get("error", "Unknown error")
+                    if isinstance(status_data, dict)
+                    else "Unknown error"
+                )
+                raise ValueError(f"Query execution failed: {error_msg}")
+            elif status in ["pending", "running", "queued"]:
+                time.sleep(poll_interval)
+                retry_count += 1
+            else:
+                raise ValueError(f"Unknown status: {status}")
+
+        if retry_count >= max_retries:
+            raise TimeoutError(
+                f"Query execution timed out after {max_retries * poll_interval} seconds"
+            )
+
+        # Step 3: Fetch results
+        results_response = session.get(
+            f"{base_url}/query-runs/{run_id}/results",
+            headers=headers,
+            params={"f": "json"},
+        )
+        results_response.raise_for_status()
+        query_output = results_response.json()
+
+        # Extract and yield all data
+        yield query_output.get("data", [])
+
+    return (fetch_query_results,)
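
Since allium_source is a regular dlt source, it can also be exercised standalone. A minimal usage sketch, assuming a duckdb destination; the API key, query id, and parameters are placeholders:

import dlt

from ingestr.src.allium import allium_source

pipeline = dlt.pipeline(
    pipeline_name="allium",
    destination="duckdb",
    dataset_name="allium_data",
)

# run-async submission, status polling, and the final result fetch all happen
# inside the query_results resource defined above
info = pipeline.run(
    allium_source(
        api_key="ALLIUM_API_KEY",  # placeholder
        query_id="abc123",         # example id from the docstring
        parameters={"start_date": "2025-02-01", "end_date": "2025-02-02"},
        limit=1000,
    )
)
print(info)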
ingestr/src/anthropic/__init__.py ADDED
@@ -0,0 +1,277 @@
+"""Anthropic source for loading Claude Code usage analytics and other Anthropic API data."""
+
+from typing import Any, Dict, Iterator, Optional, Sequence
+
+import dlt
+import pendulum
+from dlt.sources import DltResource
+
+from .helpers import (
+    fetch_api_keys,
+    fetch_claude_code_usage,
+    fetch_cost_report,
+    fetch_invites,
+    fetch_organization_info,
+    fetch_usage_report,
+    fetch_users,
+    fetch_workspace_members,
+    fetch_workspaces,
+)
+
+
+@dlt.source(max_table_nesting=0)
+def anthropic_source(
+    api_key: str,
+    initial_start_date: Optional[pendulum.DateTime] = None,
+    end_date: Optional[pendulum.DateTime] = None,
+) -> Sequence[DltResource]:
+    """
+    Load data from Anthropic APIs.
+
+    Currently supports:
+    - Claude Code Usage Analytics
+
+    Args:
+        api_key: Anthropic Admin API key (starts with sk-ant-admin...)
+        initial_start_date: Start date for data retrieval (defaults to 2023-01-01)
+        end_date: Optional end date for data retrieval
+
+    Returns:
+        Sequence of DLT resources with Anthropic data
+    """
+
+    # Default start date to 2023-01-01 if not provided
+    start_date: pendulum.DateTime = (
+        initial_start_date
+        if initial_start_date is not None
+        else pendulum.datetime(2023, 1, 1)
+    )
+
+    # Prepare end_value for incremental
+    end_value_str = None
+    if end_date is not None:
+        end_value_str = end_date.to_date_string()
+
+    @dlt.resource(
+        name="claude_code_usage",
+        write_disposition="merge",
+        primary_key=["date", "actor_type", "actor_id", "terminal_type"],
+    )
+    def claude_code_usage(
+        date: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "date",
+            initial_value=start_date.to_date_string(),
+            end_value=end_value_str,
+        ),
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Load Claude Code usage analytics data incrementally by date.
+
+        Yields flattened records with:
+        - date: The date of the usage data
+        - actor_type: Type of actor (user_actor or api_actor)
+        - actor_id: Email address or API key name
+        - organization_id: Organization UUID
+        - customer_type: api or subscription
+        - terminal_type: Terminal/environment type
+        - Core metrics (sessions, lines of code, commits, PRs)
+        - Tool actions (accepted/rejected counts by tool)
+        - Model usage and costs
+        """
+
+        # Get the date range from the incremental state
+        start_value = date.last_value if date.last_value else date.initial_value
+        start_date_parsed = (
+            pendulum.parse(start_value) if start_value else pendulum.now()
+        )
+
+        # Ensure we have a DateTime object
+        if isinstance(start_date_parsed, pendulum.DateTime):
+            start_date = start_date_parsed
+        elif isinstance(start_date_parsed, pendulum.Date):
+            start_date = pendulum.datetime(
+                start_date_parsed.year, start_date_parsed.month, start_date_parsed.day
+            )
+        else:
+            start_date = pendulum.now()
+
+        end_filter = pendulum.now()
+        if date.end_value:
+            end_filter_parsed = pendulum.parse(date.end_value)
+            # Ensure we have a DateTime object
+            if isinstance(end_filter_parsed, pendulum.DateTime):
+                end_filter = end_filter_parsed
+            elif isinstance(end_filter_parsed, pendulum.Date):
+                end_filter = pendulum.datetime(
+                    end_filter_parsed.year,
+                    end_filter_parsed.month,
+                    end_filter_parsed.day,
+                )
+
+        # Iterate through each day in the range
+        current_date = start_date
+        while current_date.date() <= end_filter.date():
+            # Fetch data for the current date
+            for record in fetch_claude_code_usage(
+                api_key, current_date.to_date_string()
+            ):
+                yield record
+
+            # Move to the next day
+            current_date = current_date.add(days=1)
+
+    @dlt.resource(
+        name="usage_report",
+        write_disposition="merge",
+        primary_key=["bucket", "api_key_id", "workspace_id", "model", "service_tier"],
+    )
+    def usage_report() -> Iterator[Dict[str, Any]]:
+        """
+        Load usage report data from the messages endpoint.
+
+        Yields records with token usage and server tool usage metrics.
+        """
+
+        # Convert dates to ISO format with timezone
+        start_iso = start_date.to_iso8601_string()
+        end_iso = (
+            end_date.to_iso8601_string()
+            if end_date
+            else pendulum.now().to_iso8601_string()
+        )
+
+        for record in fetch_usage_report(
+            api_key,
+            starting_at=start_iso,
+            ending_at=end_iso,
+            bucket_width="1h",  # Hourly buckets by default
+        ):
+            yield record
+
+    @dlt.resource(
+        name="cost_report",
+        write_disposition="merge",
+        primary_key=["bucket", "workspace_id", "description"],
+    )
+    def cost_report() -> Iterator[Dict[str, Any]]:
+        """
+        Load cost report data.
+
+        Yields records with cost breakdowns by workspace and description.
+        """
+
+        # Convert dates to ISO format with timezone
+        start_iso = start_date.to_iso8601_string()
+        end_iso = (
+            end_date.to_iso8601_string()
+            if end_date
+            else pendulum.now().to_iso8601_string()
+        )
+
+        for record in fetch_cost_report(
+            api_key,
+            starting_at=start_iso,
+            ending_at=end_iso,
+        ):
+            yield record
+
+    @dlt.resource(
+        name="organization",
+        write_disposition="replace",
+    )
+    def organization() -> Iterator[Dict[str, Any]]:
+        """
+        Load organization information.
+
+        Yields a single record with organization details.
+        """
+        org_info = fetch_organization_info(api_key)
+        if org_info:
+            yield org_info
+
+    @dlt.resource(
+        name="workspaces",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def workspaces() -> Iterator[Dict[str, Any]]:
+        """
+        Load all workspaces in the organization.
+
+        Yields records with workspace details including name, type, and creation date.
+        """
+        for workspace in fetch_workspaces(api_key):
+            yield workspace
+
+    @dlt.resource(
+        name="api_keys",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def api_keys() -> Iterator[Dict[str, Any]]:
+        """
+        Load all API keys in the organization.
+
+        Yields records with API key details including name, status, and creation date.
+        """
+        for api_key_record in fetch_api_keys(api_key):
+            yield api_key_record
+
+    @dlt.resource(
+        name="invites",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def invites() -> Iterator[Dict[str, Any]]:
+        """
+        Load all pending invites in the organization.
+
+        Yields records with invite details including email, role, and expiration.
+        """
+        for invite in fetch_invites(api_key):
+            yield invite
+
+    @dlt.resource(
+        name="users",
+        write_disposition="replace",
+        primary_key=["id"],
+    )
+    def users() -> Iterator[Dict[str, Any]]:
+        """
+        Load all users in the organization.
+
+        Yields records with user details including email, name, and role.
+        """
+        for user in fetch_users(api_key):
+            yield user
+
+    @dlt.resource(
+        name="workspace_members",
+        write_disposition="replace",
+        primary_key=["workspace_id", "user_id"],
+    )
+    def workspace_members() -> Iterator[Dict[str, Any]]:
+        """
+        Load workspace members for all workspaces.
+
+        Yields records with workspace membership details.
+        """
+        # First get all workspaces
+        for workspace in fetch_workspaces(api_key):
+            workspace_id = workspace.get("id")
+            if workspace_id:
+                # Get members for each workspace
+                for member in fetch_workspace_members(api_key, workspace_id):
+                    yield member
+
+    return [
+        claude_code_usage,
+        usage_report,
+        cost_report,
+        organization,
+        workspaces,
+        api_keys,
+        invites,
+        users,
+        workspace_members,
+    ]
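
As with the other new sources, anthropic_source can be run directly through dlt. A minimal sketch, assuming a duckdb destination; the Admin API key is a placeholder, and with_resources is standard dlt for selecting a subset of resources:

import dlt
import pendulum

from ingestr.src.anthropic import anthropic_source

pipeline = dlt.pipeline(
    pipeline_name="anthropic",
    destination="duckdb",
    dataset_name="anthropic_data",
)

source = anthropic_source(
    api_key="sk-ant-admin-...",  # Admin API key placeholder
    initial_start_date=pendulum.datetime(2024, 1, 1),
)

# claude_code_usage resumes from the last loaded 'date' on later runs thanks to
# dlt.sources.incremental; the replace-disposition resources reload in full.
info = pipeline.run(source.with_resources("claude_code_usage", "usage_report"))
print(info)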