ingestr 0.13.20__py3-none-any.whl → 0.13.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +29 -25
- ingestr/src/adjust/adjust_helpers.py +6 -2
- ingestr/src/applovin_max/__init__.py +5 -3
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/factory.py +2 -0
- ingestr/src/hubspot/__init__.py +0 -1
- ingestr/src/kinesis/__init__.py +3 -4
- ingestr/src/partition.py +2 -2
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/sources.py +46 -14
- {ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/METADATA +5 -5
- {ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/RECORD +20 -14
- {ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/WHEEL +0 -0
- {ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
@@ -127,40 +127,44 @@ class SqlReflectionLevel(str, Enum):
 def ingest(
     source_uri: Annotated[
         str,
-        typer.Option(
+        typer.Option(
+            help="The URI of the [green]source[/green]",
+            envvar=["SOURCE_URI", "INGESTR_SOURCE_URI"],
+        ),
     ], # type: ignore
     dest_uri: Annotated[
         str,
         typer.Option(
-            help="The URI of the [cyan]destination[/cyan]",
+            help="The URI of the [cyan]destination[/cyan]",
+            envvar=["DESTINATION_URI", "INGESTR_DESTINATION_URI"],
         ),
     ], # type: ignore
     source_table: Annotated[
         str,
         typer.Option(
             help="The table name in the [green]source[/green] to fetch",
-            envvar="SOURCE_TABLE",
+            envvar=["SOURCE_TABLE", "INGESTR_SOURCE_TABLE"],
         ),
     ], # type: ignore
     dest_table: Annotated[
         str,
         typer.Option(
             help="The table in the [cyan]destination[/cyan] to save the data into",
-            envvar="DESTINATION_TABLE",
+            envvar=["DESTINATION_TABLE", "INGESTR_DESTINATION_TABLE"],
         ),
     ] = None, # type: ignore
     incremental_key: Annotated[
         Optional[str],
         typer.Option(
             help="The incremental key from the table to be used for incremental strategies",
-            envvar="INCREMENTAL_KEY",
+            envvar=["INCREMENTAL_KEY", "INGESTR_INCREMENTAL_KEY"],
         ),
     ] = None, # type: ignore
     incremental_strategy: Annotated[
         IncrementalStrategy,
         typer.Option(
             help="The incremental strategy to use",
-            envvar="INCREMENTAL_STRATEGY",
+            envvar=["INCREMENTAL_STRATEGY", "INGESTR_INCREMENTAL_STRATEGY"],
         ),
     ] = IncrementalStrategy.create_replace, # type: ignore
     interval_start: Annotated[
@@ -168,7 +172,7 @@ def ingest(
         typer.Option(
             help="The start of the interval the incremental key will cover",
             formats=DATE_FORMATS,
-            envvar="INTERVAL_START",
+            envvar=["INTERVAL_START", "INGESTR_INTERVAL_START"],
         ),
     ] = None, # type: ignore
     interval_end: Annotated[
@@ -176,126 +180,126 @@ def ingest(
         typer.Option(
             help="The end of the interval the incremental key will cover",
             formats=DATE_FORMATS,
-            envvar="INTERVAL_END",
+            envvar=["INTERVAL_END", "INGESTR_INTERVAL_END"],
         ),
     ] = None, # type: ignore
     primary_key: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The key that will be used to deduplicate the resulting table",
-            envvar="PRIMARY_KEY",
+            envvar=["PRIMARY_KEY", "INGESTR_PRIMARY_KEY"],
         ),
     ] = None, # type: ignore
     partition_by: Annotated[
         Optional[str],
         typer.Option(
             help="The partition key to be used for partitioning the destination table",
-            envvar="PARTITION_BY",
+            envvar=["PARTITION_BY", "INGESTR_PARTITION_BY"],
         ),
     ] = None, # type: ignore
     cluster_by: Annotated[
         Optional[str],
         typer.Option(
             help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
-            envvar="CLUSTER_BY",
+            envvar=["CLUSTER_BY", "INGESTR_CLUSTER_BY"],
         ),
     ] = None, # type: ignore
     yes: Annotated[
         Optional[bool],
         typer.Option(
             help="Skip the confirmation prompt and ingest right away",
-            envvar="SKIP_CONFIRMATION",
+            envvar=["SKIP_CONFIRMATION", "INGESTR_SKIP_CONFIRMATION"],
         ),
     ] = False, # type: ignore
     full_refresh: Annotated[
         bool,
         typer.Option(
             help="Ignore the state and refresh the destination table completely",
-            envvar="FULL_REFRESH",
+            envvar=["FULL_REFRESH", "INGESTR_FULL_REFRESH"],
         ),
     ] = False, # type: ignore
     progress: Annotated[
         Progress,
         typer.Option(
             help="The progress display type, must be one of 'interactive', 'log'",
-            envvar="PROGRESS",
+            envvar=["PROGRESS", "INGESTR_PROGRESS"],
         ),
     ] = Progress.interactive, # type: ignore
     sql_backend: Annotated[
         SqlBackend,
         typer.Option(
             help="The SQL backend to use",
-            envvar="SQL_BACKEND",
+            envvar=["SQL_BACKEND", "INGESTR_SQL_BACKEND"],
        ),
     ] = SqlBackend.pyarrow, # type: ignore
     loader_file_format: Annotated[
         Optional[LoaderFileFormat],
         typer.Option(
             help="The file format to use when loading data",
-            envvar="LOADER_FILE_FORMAT",
+            envvar=["LOADER_FILE_FORMAT", "INGESTR_LOADER_FILE_FORMAT"],
         ),
     ] = None, # type: ignore
     page_size: Annotated[
         Optional[int],
         typer.Option(
             help="The page size to be used when fetching data from SQL sources",
-            envvar="PAGE_SIZE",
+            envvar=["PAGE_SIZE", "INGESTR_PAGE_SIZE"],
         ),
     ] = 50000, # type: ignore
     loader_file_size: Annotated[
         Optional[int],
         typer.Option(
             help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
-            envvar="LOADER_FILE_SIZE",
+            envvar=["LOADER_FILE_SIZE", "INGESTR_LOADER_FILE_SIZE"],
         ),
     ] = 100000, # type: ignore
     schema_naming: Annotated[
         SchemaNaming,
         typer.Option(
             help="The naming convention to use when moving the tables from source to destination. The default behavior is explained here: https://dlthub.com/docs/general-usage/schema#naming-convention",
-            envvar="SCHEMA_NAMING",
+            envvar=["SCHEMA_NAMING", "INGESTR_SCHEMA_NAMING"],
         ),
     ] = SchemaNaming.default, # type: ignore
     pipelines_dir: Annotated[
         Optional[str],
         typer.Option(
             help="The path to store dlt-related pipeline metadata. By default, ingestr will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
-            envvar="PIPELINES_DIR",
+            envvar=["PIPELINES_DIR", "INGESTR_PIPELINES_DIR"],
         ),
     ] = None, # type: ignore
     extract_parallelism: Annotated[
         Optional[int],
         typer.Option(
             help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
-            envvar="EXTRACT_PARALLELISM",
+            envvar=["EXTRACT_PARALLELISM", "INGESTR_EXTRACT_PARALLELISM"],
         ),
     ] = 5, # type: ignore
     sql_reflection_level: Annotated[
         SqlReflectionLevel,
         typer.Option(
             help="The reflection level to use when reflecting the table schema from the source",
-            envvar="SQL_REFLECTION_LEVEL",
+            envvar=["SQL_REFLECTION_LEVEL", "INGESTR_SQL_REFLECTION_LEVEL"],
         ),
     ] = SqlReflectionLevel.full, # type: ignore
     sql_limit: Annotated[
         Optional[int],
         typer.Option(
             help="The limit to use when fetching data from the source",
-            envvar="SQL_LIMIT",
+            envvar=["SQL_LIMIT", "INGESTR_SQL_LIMIT"],
         ),
     ] = None, # type: ignore
     sql_exclude_columns: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The columns to exclude from the source table",
-            envvar="SQL_EXCLUDE_COLUMNS",
+            envvar=["SQL_EXCLUDE_COLUMNS", "INGESTR_SQL_EXCLUDE_COLUMNS"],
         ),
     ] = [], # type: ignore
     columns: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The column types to be used for the destination table in the format of 'column_name:column_type'",
-            envvar="COLUMNS",
+            envvar=["COLUMNS", "INGESTR_COLUMNS"],
         ),
     ] = None, # type: ignore
 ):
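Every `ingest` option now accepts an `INGESTR_`-prefixed environment variable alongside the original name. The following is a minimal, hypothetical sketch (not part of the package; the option and help text are copied from the diff above) of how a typer/click option with a list of `envvar` names behaves: click checks the names in order and uses the first one that is set, so both `SOURCE_URI` and `INGESTR_SOURCE_URI` resolve to the same option.

# Hedged sketch of the multi-envvar behaviour; script name and output are illustrative.
from typing import Annotated

import typer

app = typer.Typer()


@app.command()
def ingest(
    source_uri: Annotated[
        str,
        typer.Option(
            help="The URI of the [green]source[/green]",
            envvar=["SOURCE_URI", "INGESTR_SOURCE_URI"],
        ),
    ],
):
    # `INGESTR_SOURCE_URI=duckdb:///local.db python demo.py` behaves the same
    # as `SOURCE_URI=duckdb:///local.db python demo.py`.
    typer.echo(source_uri)


if __name__ == "__main__":
    app()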
ingestr/src/adjust/adjust_helpers.py
CHANGED
@@ -82,7 +82,9 @@ class AdjustAPI:
             items = result.get("rows", [])
             yield items
         else:
-            raise HTTPError(
+            raise HTTPError(
+                f"Request failed with status code: {response.status_code}, {response.text}."
+            )

     def fetch_events(self):
         headers = {"Authorization": f"Bearer {self.api_key}"}
@@ -93,7 +95,9 @@ class AdjustAPI:
             result = response.json()
             yield result
         else:
-            raise HTTPError(
+            raise HTTPError(
+                f"Request failed with status code: {response.status_code}, {response.text}."
+            )


 def parse_filters(filters_raw: str) -> dict:
ingestr/src/applovin_max/__init__.py
CHANGED
@@ -105,11 +105,13 @@ def get_data(
     if response.status_code == 404:
         if "No Mediation App Id found for platform" in response.text:
             return None
-        error_message =
+        error_message = (
+            f"AppLovin MAX API error (status {response.status_code}): {response.text}"
+        )
         raise requests.HTTPError(error_message)
-
+
     response_url = response.json().get("ad_revenue_report_url")
     df = pd.read_csv(response_url)
     df["Date"] = pd.to_datetime(df["Date"])
     df["partition_date"] = df["Date"].dt.date
-    return df
+    return df
ingestr/src/buildinfo.py
CHANGED
@@ -1 +1 @@
-version = "v0.13.
+version = "v0.13.22"
ingestr/src/factory.py
CHANGED
@@ -43,6 +43,7 @@ from ingestr.src.sources import (
     MongoDbSource,
     NotionSource,
     PersonioSource,
+    PipedriveSource,
     S3Source,
     SalesforceSource,
     ShopifySource,
@@ -144,6 +145,7 @@ class SourceDestinationFactory:
         "salesforce": SalesforceSource,
         "personio": PersonioSource,
         "kinesis": KinesisSource,
+        "pipedrive": PipedriveSource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
ingestr/src/hubspot/__init__.py
CHANGED
ingestr/src/kinesis/__init__.py
CHANGED
@@ -16,7 +16,7 @@ from .helpers import get_shard_iterator, max_sequence_by_shard
     name=lambda args: args["stream_name"],
     primary_key="kinesis_msg_id",
     standalone=True,
-    max_table_nesting=0
+    max_table_nesting=0,
 )
 def kinesis_stream(
     stream_name: str,
@@ -75,7 +75,6 @@ def kinesis_stream(

     # get next shard to fetch messages from
     while shard_id := shard_ids.pop(0) if shard_ids else None:
-
         shard_iterator, _ = get_shard_iterator(
             kinesis_client,
             stream_name,
@@ -83,14 +82,14 @@ def kinesis_stream(
             last_msg, # type: ignore
             initial_at_datetime, # type: ignore
         )
-
+
         while shard_iterator:
             records = []
             records_response = kinesis_client.get_records(
                 ShardIterator=shard_iterator,
                 Limit=chunk_size, # The size of data can be up to 1 MB, it must be controlled by the user
             )
-
+
             for record in records_response["Records"]:
                 sequence_number = record["SequenceNumber"]
                 content = record["Data"]
ingestr/src/partition.py
CHANGED
@@ -13,7 +13,6 @@ def apply_athena_hints(
     additional_hints: Dict[str, TColumnSchema] = {},
 ) -> None:
     def _apply_partition_hint(resource: DltResource) -> None:
-
         columns = resource.columns if resource.columns else {}

         partition_hint = (
@@ -24,7 +23,8 @@ def apply_athena_hints(
         athena_adapter(
             resource,
             athena_partition.day(partition_column)
-            if partition_hint
+            if partition_hint
+            and partition_hint.get("data_type") in ("timestamp", "date")
             else partition_column,
         )

ingestr/src/pipedrive/__init__.py
ADDED
@@ -0,0 +1,198 @@
+"""Highly customizable source for Pipedrive, supports endpoint addition, selection and column rename
+
+Pipedrive api docs: https://developers.pipedrive.com/docs/api/v1
+
+Pipedrive changes or deprecates fields and endpoints without versioning the api.
+If something breaks, it's a good idea to check the changelog.
+Api changelog: https://developers.pipedrive.com/changelog
+
+To get an api key: https://pipedrive.readme.io/docs/how-to-find-the-api-token
+"""
+
+from typing import Any, Dict, Iterator, List, Optional, Union  # noqa: F401
+
+import dlt
+from dlt.common import pendulum
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.sources import DltResource, TDataItems
+
+from .helpers import group_deal_flows
+from .helpers.custom_fields_munger import rename_fields, update_fields_mapping
+from .helpers.pages import get_pages, get_recent_items_incremental
+from .settings import ENTITY_MAPPINGS, RECENTS_ENTITIES
+from .typing import TDataPage
+
+
+@dlt.source(name="pipedrive", max_table_nesting=0)
+def pipedrive_source(
+    pipedrive_api_key: str = dlt.secrets.value,
+    since_timestamp: Optional[Union[pendulum.DateTime, str]] = "1970-01-01 00:00:00",
+) -> Iterator[DltResource]:
+    """
+    Get data from the Pipedrive API. Supports incremental loading and custom fields mapping.
+
+    Args:
+        pipedrive_api_key: https://pipedrive.readme.io/docs/how-to-find-the-api-token
+        since_timestamp: Starting timestamp for incremental loading. By default complete history is loaded on first run.
+        incremental: Enable or disable incremental loading.
+
+    Returns resources:
+        custom_fields_mapping
+        activities
+        activityTypes
+        deals
+        deals_flow
+        deals_participants
+        files
+        filters
+        notes
+        persons
+        organizations
+        pipelines
+        products
+        stages
+        users
+        leads
+
+    For custom fields rename the `custom_fields_mapping` resource must be selected or loaded before other resources.
+
+    Resources that depend on another resource are implemented as transformers
+    so they can re-use the original resource data without re-downloading.
+    Examples: deals_participants, deals_flow
+    """
+
+    # yield nice rename mapping
+    yield create_state(pipedrive_api_key) | parsed_mapping
+
+    # parse timestamp and build kwargs
+    since_timestamp = ensure_pendulum_datetime(since_timestamp).strftime(
+        "%Y-%m-%d %H:%M:%S"
+    )
+    resource_kwargs: Any = (
+        {"since_timestamp": since_timestamp} if since_timestamp else {}
+    )
+
+    # create resources for all endpoints
+    endpoints_resources = {}
+    for entity, resource_name in RECENTS_ENTITIES.items():
+        endpoints_resources[resource_name] = dlt.resource(
+            get_recent_items_incremental,
+            name=resource_name,
+            primary_key="id",
+            write_disposition="merge",
+        )(entity, pipedrive_api_key, **resource_kwargs)
+
+    yield from endpoints_resources.values()
+
+    # create transformers for deals to participants and flows
+    yield endpoints_resources["deals"] | dlt.transformer(
+        name="deals_participants", write_disposition="merge", primary_key="id"
+    )(_get_deals_participants)(pipedrive_api_key)
+
+    yield endpoints_resources["deals"] | dlt.transformer(
+        name="deals_flow", write_disposition="merge", primary_key="id"
+    )(_get_deals_flow)(pipedrive_api_key)
+
+    yield leads(pipedrive_api_key, update_time=since_timestamp)
+
+
+def _get_deals_flow(
+    deals_page: TDataPage, pipedrive_api_key: str
+) -> Iterator[TDataItems]:
+    custom_fields_mapping = dlt.current.source_state().get("custom_fields_mapping", {})
+    for row in deals_page:
+        url = f"deals/{row['id']}/flow"
+        pages = get_pages(url, pipedrive_api_key)
+        for entity, page in group_deal_flows(pages):
+            yield dlt.mark.with_table_name(
+                rename_fields(page, custom_fields_mapping.get(entity, {})),
+                "deals_flow_" + entity,
+            )
+
+
+def _get_deals_participants(
+    deals_page: TDataPage, pipedrive_api_key: str
+) -> Iterator[TDataPage]:
+    for row in deals_page:
+        url = f"deals/{row['id']}/participants"
+        yield from get_pages(url, pipedrive_api_key)
+
+
+@dlt.resource(selected=False)
+def create_state(pipedrive_api_key: str) -> Iterator[Dict[str, Any]]:
+    def _get_pages_for_rename(
+        entity: str, fields_entity: str, pipedrive_api_key: str
+    ) -> Dict[str, Any]:
+        existing_fields_mapping: Dict[str, Dict[str, str]] = (
+            custom_fields_mapping.setdefault(entity, {})
+        )
+        # we need to process all pages before yielding
+        for page in get_pages(fields_entity, pipedrive_api_key):
+            existing_fields_mapping = update_fields_mapping(
+                page, existing_fields_mapping
+            )
+        return existing_fields_mapping
+
+    # gets all *Fields data and stores in state
+    custom_fields_mapping = dlt.current.source_state().setdefault(
+        "custom_fields_mapping", {}
+    )
+    for entity, fields_entity, _ in ENTITY_MAPPINGS:
+        if fields_entity is None:
+            continue
+        custom_fields_mapping[entity] = _get_pages_for_rename(
+            entity, fields_entity, pipedrive_api_key
+        )
+
+    yield custom_fields_mapping
+
+
+@dlt.transformer(
+    name="custom_fields_mapping",
+    write_disposition="replace",
+    columns={"options": {"data_type": "json"}},
+)
+def parsed_mapping(
+    custom_fields_mapping: Dict[str, Any],
+) -> Optional[Iterator[List[Dict[str, str]]]]:
+    """
+    Parses and yields custom fields' mapping in order to be stored in destiny by dlt
+    """
+    for endpoint, data_item_mapping in custom_fields_mapping.items():
+        yield [
+            {
+                "endpoint": endpoint,
+                "hash_string": hash_string,
+                "name": names["name"],
+                "normalized_name": names["normalized_name"],
+                "options": names["options"],
+                "field_type": names["field_type"],
+            }
+            for hash_string, names in data_item_mapping.items()
+        ]
+
+
+@dlt.resource(primary_key="id", write_disposition="merge")
+def leads(
+    pipedrive_api_key: str = dlt.secrets.value,
+    update_time: dlt.sources.incremental[str] = dlt.sources.incremental(
+        "update_time", "1970-01-01 00:00:00"
+    ),
+) -> Iterator[TDataPage]:
+    """Resource to incrementally load pipedrive leads by update_time"""
+    # Leads inherit custom fields from deals
+    fields_mapping = (
+        dlt.current.source_state().get("custom_fields_mapping", {}).get("deals", {})
+    )
+    # Load leads pages sorted from newest to oldest and stop loading when
+    # last incremental value is reached
+    pages = get_pages(
+        "leads",
+        pipedrive_api_key,
+        extra_params={"sort": "update_time DESC"},
+    )
+    for page in pages:
+        yield rename_fields(page, fields_mapping)
+
+        if update_time.start_out_of_range:
+            return
ingestr/src/pipedrive/helpers/__init__.py
ADDED
@@ -0,0 +1,23 @@
+"""Pipedrive source helpers"""
+
+from itertools import groupby
+from typing import Any, Dict, Iterable, List, Tuple, cast  # noqa: F401
+
+from dlt.common import pendulum  # noqa: F401
+
+
+def _deals_flow_group_key(item: Dict[str, Any]) -> str:
+    return item["object"]  # type: ignore[no-any-return]
+
+
+def group_deal_flows(
+    pages: Iterable[Iterable[Dict[str, Any]]],
+) -> Iterable[Tuple[str, List[Dict[str, Any]]]]:
+    for page in pages:
+        for entity, items in groupby(
+            sorted(page, key=_deals_flow_group_key), key=_deals_flow_group_key
+        ):
+            yield (
+                entity,
+                [dict(item["data"], timestamp=item["timestamp"]) for item in items],
+            )
ingestr/src/pipedrive/helpers/custom_fields_munger.py
ADDED
@@ -0,0 +1,102 @@
+from typing import Any, Dict, Optional, TypedDict
+
+import dlt
+
+from ..typing import TDataPage
+
+
+class TFieldMapping(TypedDict):
+    name: str
+    normalized_name: str
+    options: Optional[Dict[str, str]]
+    field_type: str
+
+
+def update_fields_mapping(
+    new_fields_mapping: TDataPage, existing_fields_mapping: Dict[str, Any]
+) -> Dict[str, Any]:
+    """
+    Specific function to perform data munging and push changes to custom fields' mapping stored in dlt's state
+    The endpoint must be an entity fields' endpoint
+    """
+    for data_item in new_fields_mapping:
+        # 'edit_flag' field contains a boolean value, which is set to 'True' for custom fields and 'False' otherwise.
+        if data_item.get("edit_flag"):
+            # Regarding custom fields, 'key' field contains pipedrive's hash string representation of its name
+            # We assume that pipedrive's hash strings are meant to be an univoque representation of custom fields' name, so dlt's state shouldn't be updated while those values
+            # remain unchanged
+            existing_fields_mapping = _update_field(data_item, existing_fields_mapping)
+        # Built in enum and set fields are mapped if their options have int ids
+        # Enum fields with bool and string key options are left intact
+        elif data_item.get("field_type") in {"set", "enum"}:
+            options = data_item.get("options", [])
+            first_option = options[0]["id"] if len(options) >= 1 else None
+            if isinstance(first_option, int) and not isinstance(first_option, bool):
+                existing_fields_mapping = _update_field(
+                    data_item, existing_fields_mapping
+                )
+    return existing_fields_mapping
+
+
+def _update_field(
+    data_item: Dict[str, Any],
+    existing_fields_mapping: Optional[Dict[str, TFieldMapping]],
+) -> Dict[str, TFieldMapping]:
+    """Create or update the given field's info the custom fields state
+    If the field hash already exists in the state from previous runs the name is not updated.
+    New enum options (if any) are appended to the state.
+    """
+    existing_fields_mapping = existing_fields_mapping or {}
+    key = data_item["key"]
+    options = data_item.get("options", [])
+    new_options_map = {str(o["id"]): o["label"] for o in options}
+    existing_field = existing_fields_mapping.get(key)
+    if not existing_field:
+        existing_fields_mapping[key] = dict(
+            name=data_item["name"],
+            normalized_name=_normalized_name(data_item["name"]),
+            options=new_options_map,
+            field_type=data_item["field_type"],
+        )
+        return existing_fields_mapping
+    existing_options = existing_field.get("options", {})
+    if not existing_options or existing_options == new_options_map:
+        existing_field["options"] = new_options_map
+        existing_field["field_type"] = data_item[
+            "field_type"
+        ]  # Add for backwards compat
+        return existing_fields_mapping
+    # Add new enum options to the existing options array
+    # so that when option is renamed the original label remains valid
+    new_option_keys = set(new_options_map) - set(existing_options)
+    for key in new_option_keys:
+        existing_options[key] = new_options_map[key]
+    existing_field["options"] = existing_options
+    return existing_fields_mapping
+
+
+def _normalized_name(name: str) -> str:
+    source_schema = dlt.current.source_schema()
+    normalized_name = name.strip()  # remove leading and trailing spaces
+    return source_schema.naming.normalize_identifier(normalized_name)
+
+
+def rename_fields(data: TDataPage, fields_mapping: Dict[str, Any]) -> TDataPage:
+    if not fields_mapping:
+        return data
+    for data_item in data:
+        for hash_string, field in fields_mapping.items():
+            if hash_string not in data_item:
+                continue
+            field_value = data_item.pop(hash_string)
+            field_name = field["name"]
+            options_map = field["options"]
+            # Get label instead of ID for 'enum' and 'set' fields
+            if field_value and field["field_type"] == "set":  # Multiple choice
+                field_value = [
+                    options_map.get(str(enum_id), enum_id) for enum_id in field_value
+                ]
+            elif field_value and field["field_type"] == "enum":
+                field_value = options_map.get(str(field_value), field_value)
+            data_item[field_name] = field_value
+    return data
ingestr/src/pipedrive/helpers/pages.py
ADDED
@@ -0,0 +1,115 @@
+from itertools import chain
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    TypeVar,
+    Union,
+)
+
+import dlt
+from dlt.sources.helpers import requests
+
+from ..typing import TDataPage
+from .custom_fields_munger import rename_fields
+
+
+def get_pages(
+    entity: str, pipedrive_api_key: str, extra_params: Dict[str, Any] = None
+) -> Iterator[List[Dict[str, Any]]]:
+    """
+    Generic method to retrieve endpoint data based on the required headers and params.
+
+    Args:
+        entity: the endpoint you want to call
+        pipedrive_api_key:
+        extra_params: any needed request params except pagination.
+
+    Returns:
+
+    """
+    headers = {"Content-Type": "application/json"}
+    params = {"api_token": pipedrive_api_key}
+    if extra_params:
+        params.update(extra_params)
+    url = f"https://app.pipedrive.com/v1/{entity}"
+    yield from _paginated_get(url, headers=headers, params=params)
+
+
+def get_recent_items_incremental(
+    entity: str,
+    pipedrive_api_key: str,
+    since_timestamp: dlt.sources.incremental[str] = dlt.sources.incremental(
+        "update_time|modified", "1970-01-01 00:00:00"
+    ),
+) -> Iterator[TDataPage]:
+    """Get a specific entity type from /recents with incremental state."""
+    yield from _get_recent_pages(entity, pipedrive_api_key, since_timestamp.last_value)
+
+
+def _paginated_get(
+    url: str, headers: Dict[str, Any], params: Dict[str, Any]
+) -> Iterator[List[Dict[str, Any]]]:
+    """
+    Requests and yields data 500 records at a time
+    Documentation: https://pipedrive.readme.io/docs/core-api-concepts-pagination
+    """
+    # pagination start and page limit
+    params["start"] = 0
+    params["limit"] = 500
+    while True:
+        page = requests.get(url, headers=headers, params=params).json()
+        # yield data only
+        data = page["data"]
+        if data:
+            yield data
+        # check if next page exists
+        pagination_info = page.get("additional_data", {}).get("pagination", {})
+        # is_next_page is set to True or False
+        if not pagination_info.get("more_items_in_collection", False):
+            break
+        params["start"] = pagination_info.get("next_start")
+
+
+T = TypeVar("T")
+
+
+def _extract_recents_data(data: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Results from recents endpoint contain `data` key which is either a single entity or list of entities
+
+    This returns a flat list of entities from an iterable of recent results
+    """
+    return [
+        data_item
+        for data_item in chain.from_iterable(
+            (_list_wrapped(item["data"]) for item in data)
+        )
+        if data_item is not None
+    ]
+
+
+def _list_wrapped(item: Union[List[T], T]) -> List[T]:
+    if isinstance(item, list):
+        return item
+    return [item]
+
+
+def _get_recent_pages(
+    entity: str, pipedrive_api_key: str, since_timestamp: str
+) -> Iterator[TDataPage]:
+    custom_fields_mapping = (
+        dlt.current.source_state().get("custom_fields_mapping", {}).get(entity, {})
+    )
+    pages = get_pages(
+        "recents",
+        pipedrive_api_key,
+        extra_params=dict(since_timestamp=since_timestamp, items=entity),
+    )
+    pages = (_extract_recents_data(page) for page in pages)
+    for page in pages:
+        yield rename_fields(page, custom_fields_mapping)
+
+
+__source_name__ = "pipedrive"
ingestr/src/pipedrive/settings.py
ADDED
@@ -0,0 +1,27 @@
+"""Pipedrive source settings and constants"""
+
+ENTITY_MAPPINGS = [
+    ("activity", "activityFields", {"user_id": 0}),
+    ("organization", "organizationFields", None),
+    ("person", "personFields", None),
+    ("product", "productFields", None),
+    ("deal", "dealFields", None),
+    ("pipeline", None, None),
+    ("stage", None, None),
+    ("user", None, None),
+]
+
+RECENTS_ENTITIES = {
+    "activity": "activities",
+    "activityType": "activity_types",
+    "deal": "deals",
+    "file": "files",
+    "filter": "filters",
+    "note": "notes",
+    "person": "persons",
+    "organization": "organizations",
+    "pipeline": "pipelines",
+    "product": "products",
+    "stage": "stages",
+    "user": "users",
+}
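For context on how the new source fits together, here is a minimal, hypothetical dlt pipeline that loads Pipedrive deals with custom-field renaming. Per the docstring in `pipedrive/__init__.py`, the `custom_fields_mapping` resource should be selected alongside the other resources so that custom field hashes get renamed; the pipeline name, destination, and token below are placeholders, not values from the package.

# Hedged usage sketch; not shipped with the package.
import dlt

from ingestr.src.pipedrive import pipedrive_source

pipeline = dlt.pipeline(
    pipeline_name="pipedrive_demo",  # illustrative name
    destination="duckdb",            # any dlt destination works
    dataset_name="pipedrive_raw",
)

source = pipedrive_source(
    pipedrive_api_key="YOUR_API_TOKEN",  # placeholder; see the readme link in the docstring
    since_timestamp="2024-01-01 00:00:00",
)

# Select the rename-mapping resource together with the entities you want.
load_info = pipeline.run(source.with_resources("custom_fields_mapping", "deals"))
print(load_info)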
ingestr/src/sources.py
CHANGED
@@ -85,6 +85,7 @@ from ingestr.src.linkedin_ads.dimension_time_enum import (
 from ingestr.src.mongodb import mongodb_collection
 from ingestr.src.notion import notion_databases
 from ingestr.src.personio import personio_source
+from ingestr.src.pipedrive import pipedrive_source
 from ingestr.src.salesforce import salesforce_source
 from ingestr.src.shopify import shopify_source
 from ingestr.src.slack import slack_source
@@ -177,7 +178,7 @@ class SqlSource:
                 scheme="clickhouse+native",
                 query=urlencode(query_params, doseq=True),
             ).geturl()
-
+
         if uri.startswith("db2://"):
             uri = uri.replace("db2://", "db2+ibm_db://")

@@ -1837,8 +1838,8 @@ class AppLovinSource:


 class ApplovinMaxSource:
-    #expected uri format: applovinmax://?api_key=<api_key>
-    #expected table format: user_ad_revenue:app_id_1,app_id_2
+    # expected uri format: applovinmax://?api_key=<api_key>
+    # expected table format: user_ad_revenue:app_id_1,app_id_2

     def handles_incrementality(self) -> bool:
         return True
@@ -1850,7 +1851,7 @@ class ApplovinMaxSource:
         api_key = params.get("api_key")
         if api_key is None:
             raise ValueError("api_key is required to connect to AppLovin Max API.")
-
+
         AVAILABLE_TABLES = ["user_ad_revenue"]

         table_fields = table.split(":")
@@ -1860,7 +1861,7 @@ class ApplovinMaxSource:
             raise ValueError(
                 "Invalid table format. Expected format is user_ad_revenue:app_id_1,app_id_2"
             )
-
+
         if requested_table not in AVAILABLE_TABLES:
             raise ValueError(
                 f"Table name '{requested_table}' is not supported for AppLovin Max source yet."
@@ -1868,17 +1869,15 @@ class ApplovinMaxSource:
                 "If you need additional tables, please create a GitHub issue at "
                 "https://github.com/bruin-data/ingestr"
             )
-
-        applications = [
+
+        applications = [
+            i for i in table_fields[1].replace(" ", "").split(",") if i.strip()
+        ]
         if len(applications) == 0:
-            raise ValueError(
-
-            )
-
+            raise ValueError("At least one application id is required")
+
         if len(applications) != len(set(applications)):
-            raise ValueError(
-                "Application ids must be unique."
-            )
+            raise ValueError("Application ids must be unique.")

         interval_start = kwargs.get("interval_start")
         interval_end = kwargs.get("interval_end")
@@ -2009,3 +2008,36 @@ class KinesisSource:
         return kinesis_stream(
             stream_name=table, credentials=credentials, initial_at_timestamp=start_date
         )
+
+
+class PipedriveSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_key = params.get("api_token")
+        if api_key is None:
+            raise MissingValueError("api_token", "Pipedrive")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.parse("2000-01-01")
+
+        if table not in [
+            "users",
+            "activities",
+            "persons",
+            "organizations",
+            "products",
+            "stages",
+            "deals",
+        ]:
+            raise UnsupportedResourceError(table, "Pipedrive")
+
+        return pipedrive_source(
+            pipedrive_api_key=api_key, since_timestamp=start_date
+        ).with_resources(table)
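A short, hypothetical illustration of how the wrapper above is driven: the `pipedrive://?api_token=...` URI shape is inferred from the `parse_qs(...)` lookup in `dlt_source` and from the `"pipedrive"` entry added in `factory.py`; it is not spelled out elsewhere in this diff, and the token and table below are placeholders.

# Hedged sketch; URI form and values are assumptions based on the code above.
from ingestr.src.sources import PipedriveSource

source_wrapper = PipedriveSource()

dlt_source = source_wrapper.dlt_source(
    uri="pipedrive://?api_token=YOUR_API_TOKEN",  # placeholder token
    table="deals",                                # one of the tables allowed above
    interval_start=None,                          # falls back to 2000-01-01 in the code
)
# `dlt_source` is a dlt source restricted to the requested resource and can be
# passed to `dlt.pipeline(...).run(...)` like any other ingestr source.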
{ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.
+Version: 0.13.22
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -16,7 +16,7 @@ Classifier: Topic :: Database
 Requires-Python: >=3.9
 Requires-Dist: aiobotocore==2.21.1
 Requires-Dist: aiohappyeyeballs==2.4.8
-Requires-Dist: aiohttp==3.11.
+Requires-Dist: aiohttp==3.11.15
 Requires-Dist: aioitertools==0.12.0
 Requires-Dist: aiosignal==1.3.2
 Requires-Dist: alembic==1.15.1
@@ -55,8 +55,8 @@ Requires-Dist: facebook-business==20.0.0
 Requires-Dist: filelock==3.17.0
 Requires-Dist: flatten-json==0.1.14
 Requires-Dist: frozenlist==1.5.0
-Requires-Dist: fsspec==
-Requires-Dist: gcsfs==
+Requires-Dist: fsspec==2025.3.2
+Requires-Dist: gcsfs==2025.3.2
 Requires-Dist: gitdb==4.0.12
 Requires-Dist: gitpython==3.1.44
 Requires-Dist: giturlparse==0.12.0
@@ -149,7 +149,7 @@ Requires-Dist: rich-argparse==1.7.0
 Requires-Dist: rich==13.9.4
 Requires-Dist: rsa==4.9
 Requires-Dist: rudder-sdk-python==2.1.4
-Requires-Dist: s3fs==
+Requires-Dist: s3fs==2025.3.2
 Requires-Dist: s3transfer==0.11.3
 Requires-Dist: scramp==1.4.5
 Requires-Dist: semver==3.0.4
{ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
-ingestr/main.py,sha256=
+ingestr/main.py,sha256=74lbiWEa27MUKFPbyUNGIlrwD5fRxej5cKFwe_LX1pE,25452
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
-ingestr/src/buildinfo.py,sha256=
+ingestr/src/buildinfo.py,sha256=ExEPLDyz3-FkQx0OHsblNsR-B9G1fUx77cQtxlv6CXA,21
 ingestr/src/destinations.py,sha256=vrGij4qMPCdXTMIimROWBJFqzOqCM4DFmgyubgSHejA,11279
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256
+ingestr/src/factory.py,sha256=1jqcLv_QUUGeyg1OYN3ywrRdcDZyDRtMOongwyjDapU,5268
 ingestr/src/filters.py,sha256=0JQXeAr2APFMnW2sd-6BlAMWv93bXV17j8b5MM8sHmM,580
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
-ingestr/src/partition.py,sha256=
+ingestr/src/partition.py,sha256=E0WHqh1FTheQAIVK_-jWUx0dgyYZCD1VxlAm362gao4,964
 ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
-ingestr/src/sources.py,sha256=
+ingestr/src/sources.py,sha256=kGsgFWf8Ghha0-HlC6PlDIIKX2Lriah4UmAseziGdr4,72035
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
 ingestr/src/adjust/__init__.py,sha256=ULjtJqrNS6XDvUyGl0tjl12-tLyXlCgeFe2icTbtu3Q,3255
-ingestr/src/adjust/adjust_helpers.py,sha256=
+ingestr/src/adjust/adjust_helpers.py,sha256=G_EvNuvA7CsaOtbV3g249iAyggMDMZYbtWOzOAz_EjY,3742
 ingestr/src/airtable/__init__.py,sha256=GHWYrjI2qhs_JihdNJysB0Ni3bzqT_MLXn_S9_Q5zRA,2775
 ingestr/src/applovin/__init__.py,sha256=X_YCLppPrnL8KXfYWICE_uDfMzHHH3JZ-DBGZ1RlaOI,6984
-ingestr/src/applovin_max/__init__.py,sha256=
+ingestr/src/applovin_max/__init__.py,sha256=CBMADQ23gi0oxAsxe-RV67GGb8I4EFOX_It45Vv9dj4,3315
 ingestr/src/appsflyer/_init_.py,sha256=ne2-9FQ654Drtd3GkKQv8Bwb6LEqCnJw49MfO5Jyzgs,739
 ingestr/src/appsflyer/client.py,sha256=TNmwakLzmO6DZW3wcfLfQRl7aNBHgFqSsk4ef-MmJ1w,3084
 ingestr/src/appstore/__init__.py,sha256=3P4VZH2WJF477QjW19jMTwu6L8DXcLkYSdutnvp3AmM,4742
@@ -58,12 +58,12 @@ ingestr/src/google_sheets/helpers/api_calls.py,sha256=RiVfdacbaneszhmuhYilkJnkc9
 ingestr/src/google_sheets/helpers/data_processing.py,sha256=WYO6z4XjGcG0Hat2J2enb-eLX5mSNVb2vaqRE83FBWU,11000
 ingestr/src/gorgias/__init__.py,sha256=_mFkMYwlY5OKEY0o_FK1OKol03A-8uk7bm1cKlmt5cs,21432
 ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOINE,5427
-ingestr/src/hubspot/__init__.py,sha256=
+ingestr/src/hubspot/__init__.py,sha256=NYgSIAPXQh2Qp1eKun7TgcerKogq6pWtNkr-_f0FXbI,9464
 ingestr/src/hubspot/helpers.py,sha256=PTn-UHJv1ENIvA5azUTaHCmFXgmHLJC1tUatQ1N-KFE,6727
 ingestr/src/hubspot/settings.py,sha256=9P1OKiRL88kl_m8n1HhuG-Qpq9VGbqPLn5Q0QYneToU,2193
 ingestr/src/kafka/__init__.py,sha256=wMCXdiraeKd1Kssi9WcVCGZaNGm2tJEtnNyuB4aR5_k,3541
 ingestr/src/kafka/helpers.py,sha256=V9WcVn3PKnEpggArHda4vnAcaV8VDuh__dSmRviJb5Y,7502
-ingestr/src/kinesis/__init__.py,sha256=
+ingestr/src/kinesis/__init__.py,sha256=u5ThH1y8uObZKXgIo71em1UnX6MsVHWOjcf1jKqKbE8,6205
 ingestr/src/kinesis/helpers.py,sha256=aF0GCDKSectaaW8XPdERY_6bUs0ky19dcBs24ZFn-o0,2473
 ingestr/src/klaviyo/_init_.py,sha256=ucWHqBe8DQvXVpbmxKFAV5ljpCFb4ps_2QTD0OSiWxY,7905
 ingestr/src/klaviyo/client.py,sha256=tPj79ia7AW0ZOJhzlKNPCliGbdojRNwUFp8HvB2ym5s,7434
@@ -80,6 +80,12 @@ ingestr/src/notion/helpers/client.py,sha256=QXuudkf5Zzff98HRsCqA1g1EZWIrnfn1falP
 ingestr/src/notion/helpers/database.py,sha256=gigPibTeVefP3lA-8w4aOwX67pj7RlciPk5koDs1ry8,2737
 ingestr/src/personio/__init__.py,sha256=sHYpoV-rg-kA1YsflctChis0hKcTrL6mka9O0CHV4zA,11638
 ingestr/src/personio/helpers.py,sha256=EKmBN0Lf4R0lc3yqqs7D-RjoZ75E8gPcctt59xwHxrY,2901
+ingestr/src/pipedrive/__init__.py,sha256=iRrxeMwo8_83ptgGnTFTNHV1nYvIsFfg0a3XzugPYeI,6982
+ingestr/src/pipedrive/settings.py,sha256=q119Fy4C5Ip1rMoCILX2BkHV3bwiXC_dW58KIiDUzsY,708
+ingestr/src/pipedrive/typing.py,sha256=lEMXu4hhAA3XkhVSlBUa-juqyupisd3c-qSQKxFvzoE,69
+ingestr/src/pipedrive/helpers/__init__.py,sha256=UX1K_qnGXB0ShtnBOfp2XuVbK8RRoCK8TdEmIjRckgg,710
+ingestr/src/pipedrive/helpers/custom_fields_munger.py,sha256=rZ4AjdITHfJE2NNomCR7vMBS1KnWpEGVF6fADwsIHUE,4488
+ingestr/src/pipedrive/helpers/pages.py,sha256=Klpjw2OnMuhzit3PpiHKsfzGcJ3rQPSQBl3HhE3-6eA,3358
 ingestr/src/salesforce/__init__.py,sha256=2hik5pRrxVODdDTlUEMoyccNC07zozjnxkMHcjMT1qA,4558
 ingestr/src/salesforce/helpers.py,sha256=QTdazBt-qRTBbCQMZnyclIaDQFmBixBy_RDKD00Lt-8,2492
 ingestr/src/shopify/__init__.py,sha256=PF_6VQnS065Br1UzSIekTVXBu3WtrMQL_v5CfbfaX5Y,63151
@@ -112,8 +118,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
 ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
+ingestr-0.13.22.dist-info/METADATA,sha256=SC89LgkVuV22LAaSCETkDoT6bFYCgIkHjLgs2UP4q4c,13627
+ingestr-0.13.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.22.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.22.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.22.dist-info/RECORD,,
{ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/WHEEL
File without changes
{ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/entry_points.txt
File without changes
{ingestr-0.13.20.dist-info → ingestr-0.13.22.dist-info}/licenses/LICENSE.md
File without changes