ingestr 0.14.93__py3-none-any.whl → 0.14.96__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
- version = "v0.14.93"
+ version = "v0.14.96"
ingestr/src/couchbase_source/__init__.py ADDED
@@ -0,0 +1,118 @@
+ """Source that loads data from Couchbase buckets, supports incremental loads."""
+
+ from typing import Optional
+
+ import dlt
+ from dlt.sources import DltResource
+
+ from .helpers import (
+     CouchbaseConfiguration,
+     client_from_credentials,
+     fetch_documents,
+ )
+
+
+ @dlt.source(max_table_nesting=0)
+ def couchbase_source(
+     connection_string: str = dlt.secrets.value,
+     username: str = dlt.secrets.value,
+     password: str = dlt.secrets.value,
+     bucket: str = dlt.config.value,
+     scope: Optional[str] = dlt.config.value,
+     collection: Optional[str] = dlt.config.value,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     write_disposition: Optional[str] = dlt.config.value,
+     limit: Optional[int] = None,
+ ) -> DltResource:
+     """
+     A DLT source which loads data from a Couchbase bucket using Couchbase Python SDK.
+
+     Args:
+         connection_string (str): Couchbase connection string (e.g., 'couchbase://localhost')
+         username (str): Couchbase username
+         password (str): Couchbase password
+         bucket (str): Bucket name to load data from
+         scope (Optional[str]): Scope name (defaults to '_default')
+         collection (Optional[str]): Collection name (defaults to '_default')
+         incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading.
+             E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+         write_disposition (str): Write disposition of the resource.
+         limit (Optional[int]): The maximum number of documents to load.
+
+     Returns:
+         DltResource: A DLT resource for the Couchbase collection.
+     """
+     # Set up Couchbase client
+     cluster = client_from_credentials(connection_string, username, password)
+
+     resource_name = f"{bucket}_{scope}_{collection}"
+
+     return dlt.resource(  # type: ignore[call-overload, arg-type]
+         fetch_documents,
+         name=resource_name,
+         primary_key="id",
+         write_disposition=write_disposition or "replace",
+         spec=CouchbaseConfiguration,
+         max_table_nesting=0,
+     )(
+         cluster=cluster,
+         bucket_name=bucket,
+         scope_name=scope,
+         collection_name=collection,
+         incremental=incremental,
+         limit=limit,
+     )
+
+
+ @dlt.resource(
+     name=lambda args: f"{args['bucket']}_{args['scope']}_{args['collection']}",
+     standalone=True,
+     spec=CouchbaseConfiguration,  # type: ignore[arg-type]
+ )
+ def couchbase_collection(
+     connection_string: str = dlt.secrets.value,
+     username: str = dlt.secrets.value,
+     password: str = dlt.secrets.value,
+     bucket: str = dlt.config.value,
+     scope: Optional[str] = dlt.config.value,
+     collection: Optional[str] = dlt.config.value,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     write_disposition: Optional[str] = dlt.config.value,
+     limit: Optional[int] = None,
+     chunk_size: Optional[int] = 1000,
+ ) -> DltResource:
+     """
+     A DLT resource which loads a collection from Couchbase.
+
+     Args:
+         connection_string (str): Couchbase connection string (e.g., 'couchbase://localhost')
+         username (str): Couchbase username
+         password (str): Couchbase password
+         bucket (str): Bucket name to load data from
+         scope (Optional[str]): Scope name (defaults to '_default')
+         collection (Optional[str]): Collection name (defaults to '_default')
+         incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading.
+         write_disposition (str): Write disposition of the resource.
+         limit (Optional[int]): The maximum number of documents to load.
+         chunk_size (Optional[int]): The number of documents to load in each batch.
+
+     Returns:
+         DltResource: A DLT resource for the Couchbase collection.
+     """
+     # Set up Couchbase client
+     cluster = client_from_credentials(connection_string, username, password)
+
+     return dlt.resource(  # type: ignore[call-overload]
+         fetch_documents,
+         name=f"{bucket}_{scope}_{collection}",
+         primary_key="id",
+         write_disposition=write_disposition or "replace",
+     )(
+         cluster=cluster,
+         bucket_name=bucket,
+         scope_name=scope,
+         collection_name=collection,
+         incremental=incremental,
+         limit=limit,
+         chunk_size=chunk_size,
+     )
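
For orientation, a minimal sketch of wiring this new source into a dlt pipeline. The duckdb destination, credentials, and bucket/scope/collection names are illustrative assumptions, not part of this release:

import dlt

from ingestr.src.couchbase_source import couchbase_source

# Hypothetical local cluster and placeholder credentials, for illustration only.
pipeline = dlt.pipeline(destination="duckdb", dataset_name="couchbase_data")
source = couchbase_source(
    connection_string="couchbase://localhost",
    username="admin",
    password="password123",
    bucket="mybucket",
    scope="_default",
    collection="_default",
)
print(pipeline.run(source))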
ingestr/src/couchbase_source/helpers.py ADDED
@@ -0,0 +1,135 @@
+ """Helper functions for Couchbase source."""
+
+ from datetime import datetime, timedelta
+ from typing import Any, Dict, Iterator, Optional
+
+ import dlt
+ from couchbase.auth import PasswordAuthenticator  # type: ignore[import-untyped]
+ from couchbase.cluster import Cluster  # type: ignore[import-untyped]
+ from couchbase.options import (  # type: ignore[import-untyped]
+     ClusterOptions,
+     QueryOptions,
+ )
+ from dlt.common.configuration import configspec
+ from dlt.common.time import ensure_pendulum_datetime
+
+
+ @configspec
+ class CouchbaseConfiguration:
+     """Configuration for Couchbase source."""
+
+     connection_string: str = dlt.secrets.value
+     username: str = dlt.secrets.value
+     password: str = dlt.secrets.value
+     bucket: str = dlt.config.value
+     scope: Optional[str] = dlt.config.value
+     collection: Optional[str] = dlt.config.value
+
+
+ def client_from_credentials(
+     connection_string: str, username: str, password: str
+ ) -> Cluster:
+     """
+     Create a Couchbase cluster client from credentials.
+
+     Args:
+         connection_string: Couchbase connection string
+             - Local/self-hosted: 'couchbase://localhost'
+             - Capella (cloud): 'couchbases://your-instance.cloud.couchbase.com'
+         username: Couchbase username
+         password: Couchbase password
+
+     Returns:
+         Cluster: Connected Couchbase cluster instance
+     """
+     auth = PasswordAuthenticator(username, password)
+     options = ClusterOptions(auth)
+
+     # Apply wan_development profile for Capella (couchbases://) connections
+     # This helps avoid latency issues when accessing from different networks
+     if connection_string.startswith("couchbases://"):
+         options.apply_profile("wan_development")
+
+     cluster = Cluster(connection_string, options)
+     cluster.wait_until_ready(timedelta(seconds=30))
+
+     return cluster
+
+
+ def fetch_documents(
+     cluster: Cluster,
+     bucket_name: str,
+     scope_name: str,
+     collection_name: str,
+     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+     limit: Optional[int] = None,
+     chunk_size: Optional[int] = 1000,
+ ) -> Iterator[Dict[str, Any]]:
+     """
+     Fetch documents from a Couchbase collection using N1QL queries.
+
+     Args:
+         cluster: Couchbase cluster instance
+         bucket_name: Name of the bucket
+         scope_name: Name of the scope
+         collection_name: Name of the collection
+         incremental: Incremental loading configuration
+         limit: Maximum number of documents to fetch
+         chunk_size: Number of documents to fetch per batch
+
+     Yields:
+         Dict[str, Any]: Document data
+     """
+     # Build N1QL query with full path
+     full_collection_path = f"`{bucket_name}`.`{scope_name}`.`{collection_name}`"
+     n1ql_query = f"SELECT META().id as id, c.* FROM {full_collection_path} c"
+
+     # Add incremental filter if provided
+     if incremental and incremental.cursor_path:
+         where_clause = f" WHERE {incremental.cursor_path} >= $start_value"
+         if incremental.end_value is not None:
+             where_clause += f" AND {incremental.cursor_path} < $end_value"
+         n1ql_query += where_clause
+
+     # Add limit if provided
+     if limit:
+         n1ql_query += f" LIMIT {limit}"
+
+     # Execute query
+     try:
+         query_options = QueryOptions()
+
+         # Add parameters if incremental
+         if incremental and incremental.cursor_path:
+             named_parameters = {"start_value": incremental.last_value}
+             if incremental.end_value is not None:
+                 named_parameters["end_value"] = incremental.end_value
+             query_options = QueryOptions(named_parameters=named_parameters)
+
+         result = cluster.query(n1ql_query, query_options)
+
+         # Yield documents
+         count = 0
+         for row in result:
+             doc = dict(row)
+
+             # Convert datetime fields to proper format
+             if (
+                 incremental
+                 and incremental.cursor_path
+                 and incremental.cursor_path in doc
+             ):
+                 cursor_value = doc[incremental.cursor_path]
+                 if isinstance(cursor_value, (str, datetime)):
+                     doc[incremental.cursor_path] = ensure_pendulum_datetime(
+                         cursor_value
+                     )
+
+             yield doc
+
+             count += 1
+             if limit and count >= limit:
+                 break
+
+     except Exception as e:
+         raise Exception(f"Error executing Couchbase N1QL query: {str(e)}")
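
To make the query construction in fetch_documents concrete, a sketch of the statement it builds for an incremental run with a limit; the bucket and cursor-field names are placeholders:

# Mirrors the string assembly in fetch_documents (names are placeholders).
bucket_name, scope_name, collection_name = "mybucket", "_default", "_default"
cursor_path = "updated_at"
n1ql_query = (
    f"SELECT META().id as id, c.* "
    f"FROM `{bucket_name}`.`{scope_name}`.`{collection_name}` c"
    f" WHERE {cursor_path} >= $start_value AND {cursor_path} < $end_value"
    f" LIMIT 100"
)
print(n1ql_query)
# $start_value / $end_value are bound via QueryOptions(named_parameters=...),
# so cursor values are never interpolated into the query string itself.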
ingestr/src/factory.py CHANGED
@@ -39,6 +39,7 @@ from ingestr.src.sources import (
      AttioSource,
      ChessSource,
      ClickupSource,
+     CouchbaseSource,
      DoceboSource,
      DynamoDBSource,
      ElasticsearchSource,
@@ -83,6 +84,7 @@ from ingestr.src.sources import (
      ShopifySource,
      SlackSource,
      SmartsheetSource,
+     SocrataSource,
      SolidgateSource,
      SqlSource,
      StripeAnalyticsSource,
@@ -160,6 +162,7 @@ class SourceDestinationFactory:
          "allium": AlliumSource,
          "anthropic": AnthropicSource,
          "csv": LocalCsvSource,
+         "couchbase": CouchbaseSource,
          "docebo": DoceboSource,
          "http": HttpSource,
          "https": HttpSource,
@@ -216,6 +219,7 @@ class SourceDestinationFactory:
          "sftp": SFTPSource,
          "pinterest": PinterestSource,
          "revenuecat": RevenueCatSource,
+         "socrata": SocrataSource,
          "zoom": ZoomSource,
          "clickup": ClickupSource,
          "influxdb": InfluxDBSource,
@@ -1,4 +1,3 @@
1
- import asyncio
2
1
  from typing import Any, Dict, Iterable, Iterator
3
2
 
4
3
  import aiohttp
@@ -40,51 +39,26 @@ def revenuecat_source(
40
39
  yield project
41
40
 
42
41
  @dlt.resource(
43
- name="customers", primary_key="id", write_disposition="merge", parallelized=True
42
+ name="customer_ids",
43
+ write_disposition="replace",
44
+ selected=False,
45
+ parallelized=True,
44
46
  )
45
- def customers() -> Iterator[Dict[str, Any]]:
46
- """Get list of customers with nested purchases and subscriptions."""
47
+ def customer_ids():
47
48
  if project_id is None:
48
49
  raise ValueError("project_id is required for customers resource")
49
- endpoint = f"/projects/{project_id}/customers"
50
50
 
51
- async def process_customer_batch(customer_batch):
52
- """Process a batch of customers with async operations."""
53
- async with aiohttp.ClientSession() as session:
54
- tasks = []
55
- for customer in customer_batch:
56
- task = process_customer_with_nested_resources_async(
57
- session, api_key, project_id, customer
58
- )
59
- tasks.append(task)
51
+ yield _paginate(api_key, f"/projects/{project_id}/customers")
60
52
 
61
- return await asyncio.gather(*tasks)
62
-
63
- def process_customers_sync():
64
- """Process customers in batches using asyncio."""
65
- batch_size = 50 # Conservative batch size due to 60 req/min rate limit
66
- current_batch = []
67
-
68
- for customer in _paginate(api_key, endpoint):
69
- current_batch.append(customer)
70
-
71
- if len(current_batch) >= batch_size:
72
- # Process the batch asynchronously
73
- processed_customers = asyncio.run(
74
- process_customer_batch(current_batch)
75
- )
76
- for processed_customer in processed_customers:
77
- yield processed_customer
78
- current_batch = []
79
-
80
- # Process any remaining customers in the final batch
81
- if current_batch:
82
- processed_customers = asyncio.run(process_customer_batch(current_batch))
83
- for processed_customer in processed_customers:
84
- yield processed_customer
85
-
86
- # Yield each processed customer
87
- yield from process_customers_sync()
53
+ @dlt.transformer(
54
+ data_from=customer_ids, write_disposition="replace", parallelized=True
55
+ )
56
+ async def customers(customers) -> Iterator[Dict[str, Any]]:
57
+ async with aiohttp.ClientSession() as session:
58
+ for customer in customers:
59
+ yield await process_customer_with_nested_resources_async(
60
+ session, api_key, project_id, customer
61
+ )
88
62
 
89
63
  # Create project-dependent resources dynamically
90
64
  project_resources = []
@@ -103,6 +77,7 @@ def revenuecat_source(
103
77
 
104
78
  return [
105
79
  projects,
80
+ customer_ids,
106
81
  customers,
107
82
  *project_resources,
108
83
  ]
ingestr/src/revenuecat/helpers.py CHANGED
@@ -64,12 +64,9 @@ def _paginate(
      while True:
          data = _make_request(api_key, endpoint, current_params)

-         # Yield items from the current page
          if "items" in data and data["items"] is not None:
-             for item in data["items"]:
-                 yield item
+             yield data["items"]

-         # Check if there's a next page
          if "next_page" not in data:
              break

@@ -88,7 +85,6 @@ def convert_timestamps_to_iso(
      """Convert timestamp fields from milliseconds to ISO format."""
      for field in timestamp_fields:
          if field in record and record[field] is not None:
-             # Convert from milliseconds timestamp to ISO datetime string
              timestamp_ms = record[field]
              dt = pendulum.from_timestamp(timestamp_ms / 1000)
              record[field] = dt.to_iso8601_string()
@@ -177,87 +173,37 @@ async def _paginate_async(
      return items


- async def fetch_and_process_nested_resource_async(
-     session: aiohttp.ClientSession,
-     api_key: str,
-     project_id: str,
-     customer_id: str,
-     customer: Dict[str, Any],
-     resource_name: str,
-     timestamp_fields: Optional[List[str]] = None,
- ) -> None:
-     """
-     Fetch and process any nested resource for a customer asynchronously.
-
-     Args:
-         session: aiohttp ClientSession
-         api_key: RevenueCat API key
-         project_id: Project ID
-         customer_id: Customer ID
-         customer: Customer data dictionary to modify
-         resource_name: Name of the nested resource (e.g., 'purchases', 'subscriptions', 'events')
-         timestamp_fields: List of timestamp fields to convert to ISO format
-     """
-     # If resource not included in customer data, fetch separately
-     if resource_name not in customer or customer[resource_name] is None:
-         endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
-         customer[resource_name] = await _paginate_async(session, api_key, endpoint)
-
-     # Convert timestamps if fields specified
-     if (
-         timestamp_fields
-         and resource_name in customer
-         and customer[resource_name] is not None
-     ):
-         for item in customer[resource_name]:
-             convert_timestamps_to_iso(item, timestamp_fields)
-
-
  async def process_customer_with_nested_resources_async(
      session: aiohttp.ClientSession,
      api_key: str,
      project_id: str,
      customer: Dict[str, Any],
  ) -> Dict[str, Any]:
-     """
-     Process a customer and fetch nested resources concurrently.
-
-     Args:
-         session: aiohttp ClientSession
-         api_key: RevenueCat API key
-         project_id: Project ID
-         customer: Customer data to process
-
-     Returns:
-         Customer data with nested resources populated
-     """
      customer_id = customer["id"]
-
-     # Convert customer timestamps
      customer = convert_timestamps_to_iso(customer, ["first_seen_at", "last_seen_at"])
-
-     # Define nested resources to fetch concurrently
      nested_resources = [
          ("subscriptions", ["purchased_at", "expires_at", "grace_period_expires_at"]),
          ("purchases", ["purchased_at", "expires_at"]),
      ]

-     # Create concurrent tasks for fetching nested resources
-     tasks = []
-     for resource_name, timestamp_fields in nested_resources:
-         task = fetch_and_process_nested_resource_async(
-             session,
-             api_key,
-             project_id,
-             customer_id,
-             customer,
-             resource_name,
-             timestamp_fields,
-         )
-         tasks.append(task)
-
-     # Wait for all nested resources to be fetched
-     await asyncio.gather(*tasks)
+     async def fetch_and_convert(resource_name, timestamp_fields):
+         if resource_name not in customer or customer[resource_name] is None:
+             endpoint = f"/projects/{project_id}/customers/{customer_id}/{resource_name}"
+             customer[resource_name] = await _paginate_async(session, api_key, endpoint)
+         if (
+             timestamp_fields
+             and resource_name in customer
+             and customer[resource_name] is not None
+         ):
+             for item in customer[resource_name]:
+                 convert_timestamps_to_iso(item, timestamp_fields)
+
+     await asyncio.gather(
+         *[
+             fetch_and_convert(resource_name, timestamp_fields)
+             for resource_name, timestamp_fields in nested_resources
+         ]
+     )

      return customer

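The net effect of the revenuecat changes: _paginate now yields whole pages, customer_ids exposes those pages as an unselected resource, and a parallelized async transformer enriches each page concurrently. A minimal sketch of the same resource-plus-transformer pattern in isolation; the names, data, and duckdb destination are generic stand-ins, not the RevenueCat API:

import dlt

@dlt.resource(name="page_ids", selected=False, parallelized=True)
def page_ids():
    # Yield whole pages so each page becomes one unit of parallel work.
    for page in ([1, 2], [3, 4]):
        yield page

@dlt.transformer(data_from=page_ids, parallelized=True)
async def enriched(items):
    # Each yielded page arrives here as one item and is enriched concurrently.
    for item in items:
        yield {"id": item, "value": item * 10}

pipeline = dlt.pipeline(destination="duckdb", dataset_name="example")
print(pipeline.run(enriched))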
ingestr/src/socrata_source/__init__.py ADDED
@@ -0,0 +1,83 @@
+ """A source loading data from Socrata open data platform"""
+
+ from typing import Any, Dict, Iterator, Optional
+
+ import dlt
+
+ from .helpers import fetch_data
+
+
+ @dlt.source(name="socrata", max_table_nesting=0)
+ def source(
+     domain: str,
+     dataset_id: str,
+     app_token: Optional[str] = None,
+     username: Optional[str] = None,
+     password: Optional[str] = None,
+     incremental: Optional[Any] = None,
+     primary_key: Optional[str] = None,
+     write_disposition: Optional[str] = dlt.config.value,
+ ):
+     """
+     A dlt source for the Socrata open data platform.
+
+     Supports both full refresh (replace) and incremental loading (merge).
+
+     Args:
+         domain: The Socrata domain (e.g., "evergreen.data.socrata.com")
+         dataset_id: The dataset identifier (e.g., "6udu-fhnu")
+         app_token: Socrata app token for higher rate limits (recommended)
+         username: Username for authentication (if dataset is private)
+         password: Password for authentication (if dataset is private)
+         incremental: DLT incremental object for incremental loading
+         primary_key: Primary key field for merge operations (default: ":id")
+         write_disposition: Write disposition ("replace", "append", "merge").
+             If not provided, automatically determined based on incremental setting.
+
+     Returns:
+         A dlt source with a single "dataset" resource
+     """
+
+     @dlt.resource(
+         write_disposition=write_disposition or "replace",
+         primary_key=primary_key,  # type: ignore[call-overload]
+     )
+     def dataset(
+         incremental: Optional[dlt.sources.incremental] = incremental,  # type: ignore[type-arg]
+     ) -> Iterator[Dict[str, Any]]:
+         """
+         Yields records from a Socrata dataset.
+
+         Supports both full refresh (replace) and incremental loading (merge).
+         When incremental is provided, filters data using SoQL WHERE clause on the server side.
+
+         Yields:
+             Dict[str, Any]: Individual records from the dataset
+         """
+         fetch_kwargs: Dict[str, Any] = {
+             "domain": domain,
+             "dataset_id": dataset_id,
+             "app_token": app_token,
+             "username": username,
+             "password": password,
+         }
+
+         if incremental and incremental.cursor_path:
+             fetch_kwargs["incremental_key"] = incremental.cursor_path
+             fetch_kwargs["start_value"] = (
+                 str(incremental.last_value)
+                 if incremental.last_value is not None
+                 else None
+             )
+             if getattr(incremental, "end_value", None) is not None:
+                 ev = incremental.end_value  # type: ignore[attr-defined]
+                 fetch_kwargs["end_value"] = (
+                     ev.isoformat()  # type: ignore[union-attr]
+                     if hasattr(ev, "isoformat")
+                     else str(ev)
+                 )
+
+         # Fetch and yield records
+         yield from fetch_data(**fetch_kwargs)
+
+     return (dataset,)
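
A minimal sketch of running the new source on its own; the domain and dataset id reuse the docstring examples above, and the duckdb destination is an assumption:

import dlt

from ingestr.src.socrata_source import source as socrata_source

pipeline = dlt.pipeline(destination="duckdb", dataset_name="socrata_data")
info = pipeline.run(
    socrata_source(domain="evergreen.data.socrata.com", dataset_id="6udu-fhnu")
)
print(info)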
ingestr/src/socrata_source/helpers.py ADDED
@@ -0,0 +1,85 @@
+ """Socrata API helpers"""
+
+ from typing import Any, Dict, Iterator, Optional
+
+ from dlt.sources.helpers import requests
+
+ from .settings import DEFAULT_PAGE_SIZE, REQUEST_TIMEOUT
+
+
+ def fetch_data(
+     domain: str,
+     dataset_id: str,
+     app_token: Optional[str] = None,
+     username: Optional[str] = None,
+     password: Optional[str] = None,
+     incremental_key: Optional[str] = None,
+     start_value: Optional[str] = None,
+     end_value: Optional[str] = None,
+ ) -> Iterator[Dict[str, Any]]:
+     """
+     Fetch records from Socrata dataset with pagination and optional filtering.
+
+     Uses offset-based pagination to get all records, not just first 50000.
+     Supports incremental loading via SoQL WHERE clause for server-side filtering.
+
+     Args:
+         domain: Socrata domain (e.g., "data.seattle.gov")
+         dataset_id: Dataset identifier (e.g., "6udu-fhnu")
+         app_token: Socrata app token for higher rate limits
+         username: Username for authentication
+         password: Password for authentication
+         start_value: Minimum value for incremental_key (inclusive)
+         end_value: Maximum value for incremental_key (exclusive)
+
+     Yields:
+         Lists of records (one list per page)
+
+     Raises:
+         requests.HTTPError: If API request fails
+     """
+     url = f"https://{domain}/resource/{dataset_id}.json"
+
+     headers = {"Accept": "application/json"}
+     if app_token:
+         headers["X-App-Token"] = app_token
+
+     auth = (username, password) if username and password else None
+
+     limit = DEFAULT_PAGE_SIZE
+     offset = 0
+
+     while True:
+         params: Dict[str, Any] = {"$limit": limit, "$offset": offset}
+
+         if incremental_key and start_value:
+             start_value_iso = str(start_value).replace(" ", "T")
+             where_conditions = [f"{incremental_key} >= '{start_value_iso}'"]
+
+             if end_value:
+                 end_value_iso = str(end_value).replace(" ", "T")
+                 where_conditions.append(f"{incremental_key} < '{end_value_iso}'")
+
+             params["$where"] = " AND ".join(where_conditions)
+             params["$order"] = f"{incremental_key} ASC"
+
+         response = requests.get(
+             url,
+             headers=headers,
+             auth=auth,
+             params=params,
+             timeout=REQUEST_TIMEOUT,
+         )
+         response.raise_for_status()
+
+         data = response.json()
+
+         if not data:
+             break
+
+         yield data
+
+         if len(data) < limit:
+             break
+
+         offset += limit
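
For concreteness, a sketch of the query parameters this loop sends for the first page of an incremental fetch; the cursor field and bounds are placeholders:

# Mirrors the params dict fetch_data builds for one page (values are placeholders).
def soql_params(incremental_key, start, end, offset):
    return {
        "$limit": 50000,
        "$offset": offset,
        "$where": f"{incremental_key} >= '{start}' AND {incremental_key} < '{end}'",
        "$order": f"{incremental_key} ASC",
    }

print(soql_params("updated_at", "2024-01-01T00:00:00", "2024-02-01T00:00:00", 0))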
ingestr/src/socrata_source/settings.py ADDED
@@ -0,0 +1,8 @@
+ """Socrata API settings and constants"""
+
+ # Request timeout in seconds
+ REQUEST_TIMEOUT = 30
+
+ # Maximum number of records to fetch per page
+ # Socrata API supports up to 50000 records per request
+ DEFAULT_PAGE_SIZE = 50000
ingestr/src/sources.py CHANGED
@@ -4066,3 +4066,260 @@ class AlliumSource:
              limit=limit,
              compute_profile=compute_profile,
          )
+
+
+ class CouchbaseSource:
+     table_builder: Callable
+
+     def __init__(self, table_builder=None) -> None:
+         if table_builder is None:
+             from ingestr.src.couchbase_source import couchbase_collection
+
+             table_builder = couchbase_collection
+
+         self.table_builder = table_builder
+
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         """
+         Create a dlt source for reading data from Couchbase.
+
+         URI formats:
+         - couchbase://username:password@host
+         - couchbase://username:password@host/bucket
+         - couchbase://username:password@host?ssl=true
+         - couchbases://username:password@host (SSL enabled)
+
+         Table formats:
+         - bucket.scope.collection (when bucket not in URI)
+         - scope.collection (when bucket specified in URI path)
+
+         Note: If password contains special characters (@, :, /, etc.), they must be URL-encoded.
+
+         Examples:
+             Local/Self-hosted:
+             - couchbase://admin:password123@localhost with table "mybucket.myscope.mycollection"
+             - couchbase://admin:password123@localhost/mybucket with table "myscope.mycollection"
+             - couchbase://admin:password123@localhost?ssl=true with table "mybucket._default._default"
+
+             Capella (Cloud):
+             - couchbases://user:pass@cb.xxx.cloud.couchbase.com with table "travel-sample.inventory.airport"
+             - couchbase://user:pass@cb.xxx.cloud.couchbase.com/travel-sample?ssl=true with table "inventory.airport"
+
+         To encode password in Python:
+             from urllib.parse import quote
+             encoded_pwd = quote("MyPass@123!", safe='')
+             uri = f"couchbase://admin:{encoded_pwd}@localhost?ssl=true"
+
+         Args:
+             uri: Couchbase connection URI (can include /bucket path and ?ssl=true query parameter)
+             table: Format depends on URI:
+                 - bucket.scope.collection (if bucket not in URI)
+                 - scope.collection (if bucket in URI path)
+             **kwargs: Additional arguments:
+                 - limit: Maximum number of documents to fetch
+                 - incremental_key: Field to use for incremental loading
+                 - interval_start: Start value for incremental loading
+                 - interval_end: End value for incremental loading
+
+         Returns:
+             DltResource for the Couchbase collection
+         """
+         # Parse the URI to extract connection details
+         # urlparse automatically decodes URL-encoded credentials
+
+         parsed = urlparse(uri)
+
+         # Extract username and password from URI
+         # Note: urlparse automatically decodes URL-encoded characters in username/password
+         from urllib.parse import unquote
+
+         username = parsed.username
+         password = unquote(parsed.password) if parsed.password else None
+
+         if not username or not password:
+             raise ValueError(
+                 "Username and password must be provided in the URI.\n"
+                 "Format: couchbase://username:password@host\n"
+                 "If password has special characters (@, :, /), URL-encode them.\n"
+                 "Example: couchbase://admin:MyPass%40123@localhost for password 'MyPass@123'"
+             )
+
+         # Reconstruct connection string without credentials
+         scheme = parsed.scheme
+         netloc = parsed.netloc
+
+         # Remove username:password@ from netloc if present
+         if "@" in netloc:
+             netloc = netloc.split("@", 1)[1]
+
+         # Parse query parameters from URI
+         from urllib.parse import parse_qs
+
+         query_params = parse_qs(parsed.query)
+
+         # Check if SSL is requested via URI query parameter (?ssl=true)
+         if "ssl" in query_params:
+             ssl_value = query_params["ssl"][0].lower()
+             use_ssl = ssl_value in ("true", "1", "yes")
+
+             # Apply SSL scheme based on parameter
+             if use_ssl and scheme == "couchbase":
+                 scheme = "couchbases"
+
+         connection_string = f"{scheme}://{netloc}"
+
+         # Extract bucket from URI path if present (e.g., couchbase://host/bucket)
+         bucket_from_uri = None
+         if parsed.path and parsed.path.strip("/"):
+             bucket_from_uri = parsed.path.strip("/").split("/")[0]
+
+         # Parse table format: can be "scope.collection" or "bucket.scope.collection"
+         table_parts = table.split(".")
+
+         if len(table_parts) == 3:
+             # Format: bucket.scope.collection
+             bucket, scope, collection = table_parts
+         elif len(table_parts) == 2:
+             # Format: scope.collection (bucket from URI)
+             if bucket_from_uri:
+                 bucket = bucket_from_uri
+                 scope, collection = table_parts
+             else:
+                 raise ValueError(
+                     "Table format is 'scope.collection' but no bucket specified in URI.\n"
+                     f"Either use URI format: couchbase://user:pass@host/bucket\n"
+                     f"Or use table format: bucket.scope.collection\n"
+                     f"Got table: {table}"
+                 )
+         else:
+             raise ValueError(
+                 "Table format must be 'bucket.scope.collection' or 'scope.collection' (with bucket in URI). "
+                 f"Got: {table}\n"
+                 "Examples:\n"
+                 "  - URI: couchbase://user:pass@host, Table: travel-sample.inventory.airport\n"
+                 "  - URI: couchbase://user:pass@host/travel-sample, Table: inventory.airport"
+             )
+
+         # Handle incremental loading
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="closed",
+                 range_start="closed",
+             )
+
+         # Get optional parameters
+         limit = kwargs.get("limit")
+
+         table_instance = self.table_builder(
+             connection_string=connection_string,
+             username=username,
+             password=password,
+             bucket=bucket,
+             scope=scope,
+             collection=collection,
+             incremental=incremental,
+             limit=limit,
+         )
+         table_instance.max_table_nesting = 1
+
+         return table_instance
+
+
+ class SocrataSource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         """
+         Creates a DLT source for Socrata open data platform.
+
+         URI format: socrata://domain?app_token=TOKEN
+         Table: dataset_id (e.g., "6udu-fhnu")
+
+         Args:
+             uri: Socrata connection URI with domain and optional auth params
+             table: Dataset ID (e.g., "6udu-fhnu")
+             **kwargs: Additional arguments:
+                 - incremental_key: Field to use for incremental loading (e.g., ":updated_at")
+                 - interval_start: Start date for initial load
+                 - interval_end: End date for load
+                 - primary_key: Primary key field for merge operations
+
+         Returns:
+             DltResource for the Socrata dataset
+         """
+         from urllib.parse import parse_qs, urlparse
+
+         parsed = urlparse(uri)
+
+         domain = parsed.netloc
+         if not domain:
+             raise ValueError(
+                 "Domain must be provided in the URI.\n"
+                 "Format: socrata://domain?app_token=TOKEN\n"
+                 "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
+             )
+
+         query_params = parse_qs(parsed.query)
+
+         dataset_id = table
+         if not dataset_id:
+             raise ValueError(
+                 "Dataset ID must be provided as the table parameter.\n"
+                 "Example: --source-table 6udu-fhnu"
+             )
+
+         app_token = query_params.get("app_token", [None])[0]
+         username = query_params.get("username", [None])[0]
+         password = query_params.get("password", [None])[0]
+
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             if start_value:
+                 start_value = (
+                     start_value.isoformat()
+                     if hasattr(start_value, "isoformat")
+                     else str(start_value)
+                 )
+
+             if end_value:
+                 end_value = (
+                     end_value.isoformat()
+                     if hasattr(end_value, "isoformat")
+                     else str(end_value)
+                 )
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="open",
+                 range_start="closed",
+             )
+
+         primary_key = kwargs.get("primary_key")
+
+         from ingestr.src.socrata_source import source
+
+         return source(
+             domain=domain,
+             dataset_id=dataset_id,
+             app_token=app_token,
+             username=username,
+             password=password,
+             incremental=incremental,
+             primary_key=primary_key,
+         ).with_resources("dataset")
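
A sketch of exercising the two new source classes directly; the URIs, credentials, and table names are placeholders taken from the docstring examples above:

from ingestr.src.sources import CouchbaseSource, SocrataSource

# Both calls return dlt sources/resources that the factory hands to the pipeline.
couchbase = CouchbaseSource().dlt_source(
    uri="couchbase://admin:password123@localhost",
    table="mybucket._default._default",
    limit=10,
)

socrata = SocrataSource().dlt_source(
    uri="socrata://evergreen.data.socrata.com?app_token=mytoken",
    table="6udu-fhnu",
)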
ingestr-0.14.93.dist-info/METADATA → ingestr-0.14.96.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ingestr
- Version: 0.14.93
+ Version: 0.14.96
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -39,6 +39,7 @@ Requires-Dist: clickhouse-connect==0.8.14
  Requires-Dist: clickhouse-driver==0.2.9
  Requires-Dist: clickhouse-sqlalchemy==0.2.7
  Requires-Dist: confluent-kafka==2.8.0
+ Requires-Dist: couchbase==4.3.6
  Requires-Dist: crate==2.0.0
  Requires-Dist: cryptography==44.0.2
  Requires-Dist: curlify==2.2.1
ingestr-0.14.93.dist-info/RECORD → ingestr-0.14.96.dist-info/RECORD RENAMED
@@ -2,17 +2,17 @@ ingestr/conftest.py,sha256=OE2yxeTCosS9CUFVuqNypm-2ftYvVBeeq7egm3878cI,1981
  ingestr/main.py,sha256=qo0g3wCFl8a_1jUwXagX8L1Q8PKKQlTF7md9pfnzW0Y,27155
  ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
  ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
- ingestr/src/buildinfo.py,sha256=gpczaxQtINGa_cWhMVJsfeFoUxh-gKIyba1YESpTmpk,21
+ ingestr/src/buildinfo.py,sha256=-9qPR_WQg9aaTRg324DJAZs43V_FQHsRu9G9xDfXrjE,21
  ingestr/src/destinations.py,sha256=QtjE0AGs0WkPHaI2snWPHJ8HHi4lwXUBYLJPklz8Mvk,27772
  ingestr/src/errors.py,sha256=fhJ2BxOqOsBfOxuTDKfZblvawBrPG3x_1VikIxMZBRI,874
- ingestr/src/factory.py,sha256=k_8jgehOM2sHwCsjliYXmQhICl2B1UYoAs6vspjadv8,7770
+ ingestr/src/factory.py,sha256=iFOFbwifvQf7qOtSoNPS6RGvAhsRaX7HzbjouHmSvfs,7882
  ingestr/src/filters.py,sha256=0n0sNAVG_f-B_1r7lW5iNtw9z_G1bxWzPaiL1i6tnbU,1665
  ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
  ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
  ingestr/src/masking.py,sha256=VN0LdfvExhQ1bZMRylGtaBUIoH-vjuIUmRnYKwo3yiY,11358
  ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
  ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
- ingestr/src/sources.py,sha256=D4VxA-yqilzTG0VBJBxnw9MUJ1Qeo2EpKjVGJfoMKoY,142289
+ ingestr/src/sources.py,sha256=JVZf22XgIFXov3-yKOjsbQVw9cV_LrDeXD6eb4Z6jFk,151802
  ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
  ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
  ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -43,6 +43,8 @@ ingestr/src/chess/settings.py,sha256=p0RlCGgtXUacPDEvZmwzSWmzX0Apj1riwfz-nrMK89k
  ingestr/src/clickup/__init__.py,sha256=uvfAqNturT4bMvU4NS3E8BdL6nvDFzNuh7bMlih8HJk,2547
  ingestr/src/clickup/helpers.py,sha256=RzDKMUAHccuDhocIQ2ToBXfCERo8CBJqA3t-IPltBCE,1519
  ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
+ ingestr/src/couchbase_source/__init__.py,sha256=IPmb55mBxGWtt_9ywbY6chAwUp6jRmJTu-qEVFBhJ_s,4381
+ ingestr/src/couchbase_source/helpers.py,sha256=RA0aFT0GfLJ2pHy7emvKmm0yVXgQOQ-hMVJvw-FExNo,4487
  ingestr/src/docebo/__init__.py,sha256=RBBjlt405PIIDOLEt78g9yBNJfhUMeJxR5DZD7oufXY,27543
  ingestr/src/docebo/client.py,sha256=nki0kNQhN8VDz5cdqlQQPhr1JMPlcNEYKnWK3umAyOc,15663
  ingestr/src/docebo/helpers.py,sha256=SaEjta6k3Lj-S5fvrheA5_xj7zfASMdOc_ihsqno5ko,3238
@@ -140,8 +142,8 @@ ingestr/src/plusvibeai/__init__.py,sha256=Uo-N2-1kbq5RJw8ym5tm8rqVchVbJJ2hOd6bws
  ingestr/src/plusvibeai/helpers.py,sha256=5hxxA2-XUtkZA1xrstZ39ilzUh4EouNDOiiL-NzGu9w,17939
  ingestr/src/plusvibeai/settings.py,sha256=3Hb7jcUNshSlGO4E27yUe_8n3f0VArX9XTmkTkN-Tvo,5366
  ingestr/src/quickbooks/__init__.py,sha256=cZUuVCOTGPHTscRj6i0DytO63_fWF-4ieMxoU4PcyTg,3727
- ingestr/src/revenuecat/__init__.py,sha256=5HbyZuEOekkbeeT72sM_bnGygSyYdmd_vczfAUz7xoM,4029
- ingestr/src/revenuecat/helpers.py,sha256=CYU6l79kplnfL87GfdxyGeEBrBSWEZfGP0GyjPHuVDk,9619
+ ingestr/src/revenuecat/__init__.py,sha256=j75jkHBqd_9FsFKjsSLLwKrPcmUKOE3HJ95Qzonzmbk,2779
+ ingestr/src/revenuecat/helpers.py,sha256=ej_bR6cuNOer4bTQfd_IuyMmt-xevcPgvRShKlxO8Xo,7998
  ingestr/src/salesforce/__init__.py,sha256=Ijveo8gyo_wLzQRBklxIm3RV0y2Gta9-mR44RbJljpI,4901
  ingestr/src/salesforce/helpers.py,sha256=QTdazBt-qRTBbCQMZnyclIaDQFmBixBy_RDKD00Lt-8,2492
  ingestr/src/shopify/__init__.py,sha256=RzSSG93g-Qlkz6TAxi1XasFDdxxtVXIo53ZTtjGczW4,62602
@@ -152,6 +154,9 @@ ingestr/src/slack/__init__.py,sha256=pyDukxcilqTAe_bBzfWJ8Vxi83S-XEdEFBH2pEgILrM
  ingestr/src/slack/helpers.py,sha256=08TLK7vhFvH_uekdLVOLF3bTDe1zgH0QxHObXHzk1a8,6545
  ingestr/src/slack/settings.py,sha256=NhKn4y1zokEa5EmIZ05wtj_-I0GOASXZ5V81M1zXCtY,457
  ingestr/src/smartsheets/__init__.py,sha256=RIEfN1T2TMFg8T0RvN4o6sqC58YusJRDrmE9Isos5P4,2375
+ ingestr/src/socrata_source/__init__.py,sha256=K5DVpsVXTMfunZd5YoEsn1nipfo1zavFS59g3m2tsc8,2984
+ ingestr/src/socrata_source/helpers.py,sha256=KbVojFSmMLXb0ajh8bhqfZfxDHH7rQ3nyI8p2jxVifA,2500
+ ingestr/src/socrata_source/settings.py,sha256=DLfu-4HOa5nR7h9tbOySEa2ye3w_Z6TYZ9_zPqWaNQk,220
  ingestr/src/solidgate/__init__.py,sha256=Ts83j-JSnFsFuF4tDhVOfZKg7H0-bIpfn3kg1ZOR58A,8003
  ingestr/src/solidgate/helpers.py,sha256=mAsW_1hpD7ab3Y2vw8fxHi4yD3aT1geLdIYZ7ycyxBc,5690
  ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -184,8 +189,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
  ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
  ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
  ingestr/tests/unit/test_smartsheets.py,sha256=zf3DXT29Y4TH2lNPBFphdjlaelUUyPJcsW2UO68RzDs,4862
- ingestr-0.14.93.dist-info/METADATA,sha256=ttKTQKjoXX_xzXbQb2LisUnePWrFx5GXQf2dHCsG48g,15327
- ingestr-0.14.93.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- ingestr-0.14.93.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
- ingestr-0.14.93.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
- ingestr-0.14.93.dist-info/RECORD,,
+ ingestr-0.14.96.dist-info/METADATA,sha256=vnkdaQVPvlnpHq9UgecuzRSSb_IiKE6_gS1jLkYzGEY,15359
+ ingestr-0.14.96.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ ingestr-0.14.96.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ ingestr-0.14.96.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ ingestr-0.14.96.dist-info/RECORD,,