ingestr 0.8.3__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/src/factory.py CHANGED
@@ -28,10 +28,12 @@ from ingestr.src.sources import (
     LocalCsvSource,
     MongoDbSource,
     NotionSource,
+    S3Source,
     ShopifySource,
     SlackSource,
     SqlSource,
     StripeAnalyticsSource,
+    ZendeskSource,
 )
 
 SQL_SOURCE_SCHEMES = [
@@ -132,6 +134,10 @@ class SourceDestinationFactory:
             return KafkaSource()
         elif self.source_scheme == "adjust":
             return AdjustSource()
+        elif self.source_scheme == "zendesk":
+            return ZendeskSource()
+        elif self.source_scheme == "s3":
+            return S3Source()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
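For context, the dispatch above keys purely on the scheme of the source URI, so "zendesk" and "s3" URIs now resolve to the two new source classes. A tiny illustration (the URIs below are placeholders, not taken from this release):

from urllib.parse import urlparse

# Hypothetical URIs; only their scheme matters for SourceDestinationFactory.
for uri in ("zendesk://email:api_token@mycompany", "s3://my-bucket/exports/data.csv"):
    print(urlparse(uri).scheme)  # prints "zendesk", then "s3"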
@@ -0,0 +1,98 @@
+"""Reads files in s3, gs or azure buckets using fsspec and provides convenience resources for chunked reading of various file formats"""
+
+from typing import Iterator, List, Optional, Tuple, Union
+
+import dlt
+from dlt.sources import DltResource
+from dlt.sources.credentials import FileSystemCredentials
+from dlt.sources.filesystem import FileItem, FileItemDict, fsspec_filesystem, glob_files
+
+from .helpers import (
+    AbstractFileSystem,
+    FilesystemConfigurationResource,
+)
+from .readers import (
+    ReadersSource,
+    _read_csv,
+    _read_csv_duckdb,
+    _read_jsonl,
+    _read_parquet,
+)
+
+
+@dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource)
+def readers(
+    bucket_url: str,
+    credentials: Union[FileSystemCredentials, AbstractFileSystem],
+    file_glob: Optional[str] = "*",
+) -> Tuple[DltResource, ...]:
+    """This source provides a few resources that are chunked file readers. Readers can be further parametrized before use:
+    read_csv(chunksize, **pandas_kwargs)
+    read_jsonl(chunksize)
+    read_parquet(chunksize)
+
+    Args:
+        bucket_url (str): The url to the bucket.
+        credentials (FileSystemCredentials | AbstractFileSystem): The credentials to the filesystem, or an fsspec `AbstractFileSystem` instance.
+        file_glob (str, optional): The filter to apply to the files in glob format. By default lists all files in bucket_url non-recursively.
+    """
+    filesystem_resource = filesystem(bucket_url, credentials, file_glob=file_glob)
+    filesystem_resource.apply_hints(
+        incremental=dlt.sources.incremental("modification_date")
+    )
+    return (
+        filesystem_resource | dlt.transformer(name="read_csv")(_read_csv),
+        filesystem_resource | dlt.transformer(name="read_jsonl")(_read_jsonl),
+        filesystem_resource | dlt.transformer(name="read_parquet")(_read_parquet),
+        filesystem_resource | dlt.transformer(name="read_csv_duckdb")(_read_csv_duckdb),
+    )
+
+
+@dlt.resource(
+    primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True
+)
+def filesystem(
+    bucket_url: str = dlt.secrets.value,
+    credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value,
+    file_glob: Optional[str] = "*",
+    files_per_page: int = 100,
+    extract_content: bool = True,
+) -> Iterator[List[FileItem]]:
+    """This resource lists files in `bucket_url` using the `file_glob` pattern. The files are yielded as FileItem, which also
+    provides methods to open and read file data. It should be combined with transformers that further process (i.e. load) the files.
+
+    Args:
+        bucket_url (str): The url to the bucket.
+        credentials (FileSystemCredentials | AbstractFileSystem): The credentials to the filesystem, or an fsspec `AbstractFileSystem` instance.
+        file_glob (str, optional): The filter to apply to the files in glob format. By default lists all files in bucket_url non-recursively.
+        files_per_page (int, optional): The number of files to process at once, defaults to 100.
+        extract_content (bool, optional): If true, the content of the file will be extracted;
+            if false, an fsspec file is returned instead. Defaults to True.
+
+    Returns:
+        Iterator[List[FileItem]]: The list of files.
+    """
+
+    if isinstance(credentials, AbstractFileSystem):
+        fs_client = credentials
+    else:
+        fs_client = fsspec_filesystem(bucket_url, credentials)[0]
+
+    files_chunk: List[FileItem] = []
+    for file_model in glob_files(fs_client, bucket_url, file_glob):
+        file_dict = FileItemDict(file_model, credentials)
+        if extract_content:
+            file_dict["file_content"] = file_dict.read_bytes()
+        files_chunk.append(file_dict)  # type: ignore
+        # wait for the chunk to be full
+        if len(files_chunk) >= files_per_page:
+            yield files_chunk
+            files_chunk = []
+    if files_chunk:
+        yield files_chunk
+
+
+read_csv = dlt.transformer(standalone=True)(_read_csv)
+read_jsonl = dlt.transformer(standalone=True)(_read_jsonl)
+read_parquet = dlt.transformer(standalone=True)(_read_parquet)
+read_csv_duckdb = dlt.transformer(standalone=True)(_read_csv_duckdb)
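A minimal sketch of how this new source is consumed (it mirrors the S3Source.dlt_source wiring later in this diff; the bucket, key values, pipeline name and duckdb destination are placeholders, not part of the release):

import dlt
from dlt.common.configuration.specs import AwsCredentials
from ingestr.src.filesystem import readers

# Placeholder credentials and paths; only the call shape comes from this diff.
creds = AwsCredentials(aws_access_key_id="AKIA...", aws_secret_access_key="...")
source = readers(
    bucket_url="s3://example-bucket",
    credentials=creds,
    file_glob="exports/orders.csv",
).with_resources("read_csv")

pipeline = dlt.pipeline(pipeline_name="s3_example", destination="duckdb")
pipeline.run(source)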
@@ -0,0 +1,100 @@
+"""Helpers for the filesystem resource."""
+
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import dlt
+from dlt.common.configuration import resolve_type
+from dlt.common.typing import TDataItem
+from dlt.sources import DltResource
+from dlt.sources.config import configspec, with_config
+from dlt.sources.credentials import (
+    CredentialsConfiguration,
+    FilesystemConfiguration,
+    FileSystemCredentials,
+)
+from dlt.sources.filesystem import fsspec_filesystem
+from fsspec import AbstractFileSystem  # type: ignore
+
+
+@configspec
+class FilesystemConfigurationResource(FilesystemConfiguration):
+    credentials: Union[FileSystemCredentials, AbstractFileSystem] = None
+    file_glob: Optional[str] = "*"
+    files_per_page: int = 100
+    extract_content: bool = False
+
+    @resolve_type("credentials")
+    def resolve_credentials_type(self) -> Type[CredentialsConfiguration]:
+        # use known credentials or empty credentials for unknown protocol
+        return Union[
+            self.PROTOCOL_CREDENTIALS.get(self.protocol)
+            or Optional[CredentialsConfiguration],
+            AbstractFileSystem,
+        ]  # type: ignore[return-value]
+
+
+def fsspec_from_resource(filesystem_instance: DltResource) -> AbstractFileSystem:
+    """Extract authorized fsspec client from a filesystem resource"""
+
+    @with_config(
+        spec=FilesystemConfiguration,
+        sections=("sources", filesystem_instance.section, filesystem_instance.name),
+    )
+    def _get_fsspec(
+        bucket_url: str, credentials: Optional[FileSystemCredentials]
+    ) -> AbstractFileSystem:
+        return fsspec_filesystem(bucket_url, credentials)[0]
+
+    return _get_fsspec(
+        filesystem_instance.explicit_args.get("bucket_url", dlt.config.value),
+        filesystem_instance.explicit_args.get("credentials", dlt.secrets.value),
+    )
+
+
+def add_columns(columns: List[str], rows: List[List[Any]]) -> List[Dict[str, Any]]:
+    """Adds column names to the given rows.
+
+    Args:
+        columns (List[str]): The column names.
+        rows (List[List[Any]]): The rows.
+
+    Returns:
+        List[Dict[str, Any]]: The rows with column names.
+    """
+    result = []
+    for row in rows:
+        result.append(dict(zip(columns, row)))
+
+    return result
+
+
+def fetch_arrow(file_data, chunk_size: int) -> Iterable[TDataItem]:  # type: ignore
+    """Fetches data from the given CSV file.
+
+    Args:
+        file_data (DuckDBPyRelation): The CSV file data.
+        chunk_size (int): The number of rows to read at once.
+
+    Yields:
+        Iterable[TDataItem]: Data items, read from the given CSV file.
+    """
+    batcher = file_data.fetch_arrow_reader(batch_size=chunk_size)
+    yield from batcher
+
+
+def fetch_json(file_data, chunk_size: int) -> List[Dict[str, Any]]:  # type: ignore
+    """Fetches data from the given CSV file.
+
+    Args:
+        file_data (DuckDBPyRelation): The CSV file data.
+        chunk_size (int): The number of rows to read at once.
+
+    Yields:
+        Iterable[TDataItem]: Data items, read from the given CSV file.
+    """
+    while True:
+        batch = file_data.fetchmany(chunk_size)
+        if not batch:
+            break
+
+        yield add_columns(file_data.columns, batch)
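For reference, add_columns is what fetch_json uses to turn DuckDB's positional rows into dicts; a quick illustration with made-up values:

add_columns(["id", "name"], [[1, "alice"], [2, "bob"]])
# -> [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]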
@@ -0,0 +1,131 @@
+from typing import TYPE_CHECKING, Any, Iterator, Optional
+
+from dlt.common import json
+from dlt.common.typing import copy_sig
+from dlt.sources import DltResource, DltSource, TDataItems
+from dlt.sources.filesystem import FileItemDict
+
+from .helpers import fetch_arrow, fetch_json
+
+
+def _read_csv(
+    items: Iterator[FileItemDict], chunksize: int = 10000, **pandas_kwargs: Any
+) -> Iterator[TDataItems]:
+    """Reads csv file with Pandas chunk by chunk.
+
+    Args:
+        chunksize (int): Number of records to read in one chunk
+        **pandas_kwargs: Additional keyword arguments passed to Pandas.read_csv
+    Returns:
+        TDataItem: The file content
+    """
+    import pandas as pd
+
+    # apply defaults to pandas kwargs
+    kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
+
+    for file_obj in items:
+        # Here we use pandas chunksize to read the file in chunks and avoid loading the whole file
+        # in memory.
+        with file_obj.open() as file:
+            for df in pd.read_csv(file, **kwargs):
+                yield df.to_dict(orient="records")
+
+
+def _read_jsonl(
+    items: Iterator[FileItemDict], chunksize: int = 1000
+) -> Iterator[TDataItems]:
+    """Reads jsonl file content and extracts the data.
+
+    Args:
+        chunksize (int, optional): The number of JSON lines to load and yield at once, defaults to 1000
+
+    Returns:
+        TDataItem: The file content
+    """
+    for file_obj in items:
+        with file_obj.open() as f:
+            lines_chunk = []
+            for line in f:
+                lines_chunk.append(json.loadb(line))
+                if len(lines_chunk) >= chunksize:
+                    yield lines_chunk
+                    lines_chunk = []
+        if lines_chunk:
+            yield lines_chunk
+
+
+def _read_parquet(
+    items: Iterator[FileItemDict],
+    chunksize: int = 10,
+) -> Iterator[TDataItems]:
+    """Reads parquet file content and extracts the data.
+
+    Args:
+        chunksize (int, optional): The number of files to process at once, defaults to 10.
+
+    Returns:
+        TDataItem: The file content
+    """
+    from pyarrow import parquet as pq
+
+    for file_obj in items:
+        with file_obj.open() as f:
+            parquet_file = pq.ParquetFile(f)
+            for rows in parquet_file.iter_batches(batch_size=chunksize):
+                yield rows.to_pylist()
+
+
+def _read_csv_duckdb(
+    items: Iterator[FileItemDict],
+    chunk_size: Optional[int] = 5000,
+    use_pyarrow: bool = False,
+    **duckdb_kwargs: Any,
+) -> Iterator[TDataItems]:
+    """A resource to extract data from the given CSV files.
+
+    Uses DuckDB engine to import and cast CSV data.
+
+    Args:
+        items (Iterator[FileItemDict]): CSV files to read.
+        chunk_size (Optional[int]):
+            The number of rows to read at once. Defaults to 5000.
+        use_pyarrow (bool):
+            Whether to use `pyarrow` to read the data and designate
+            data schema. If set to False (by default), JSON is used.
+        duckdb_kwargs (Dict):
+            Additional keyword arguments to pass to the `read_csv()`.
+
+    Returns:
+        Iterable[TDataItem]: Data items, read from the given CSV files.
+    """
+    import duckdb
+
+    helper = fetch_arrow if use_pyarrow else fetch_json
+
+    for item in items:
+        with item.open() as f:
+            file_data = duckdb.from_csv_auto(f, **duckdb_kwargs)  # type: ignore
+
+            yield from helper(file_data, chunk_size)
+
+
+if TYPE_CHECKING:
+
+    class ReadersSource(DltSource):
+        """This is a typing stub that provides docstrings and signatures to the resources in the `readers` source"""
+
+        @copy_sig(_read_csv)
+        def read_csv(self) -> DltResource: ...
+
+        @copy_sig(_read_jsonl)
+        def read_jsonl(self) -> DltResource: ...
+
+        @copy_sig(_read_parquet)
+        def read_parquet(self) -> DltResource: ...
+
+        @copy_sig(_read_csv_duckdb)
+        def read_csv_duckdb(self) -> DltResource: ...
+
+else:
+    ReadersSource = DltSource
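As a rough sketch of how these readers combine with the filesystem resource added earlier in this diff: the listing resource is piped into a chunked reader, so only one chunk of rows is materialized at a time (bucket and glob are placeholders, and credentials are assumed to resolve from dlt secrets/config):

from ingestr.src.filesystem import filesystem, read_jsonl

# Placeholder bucket/glob; credentials resolve via dlt secrets in this sketch.
files = filesystem(bucket_url="s3://example-bucket", file_glob="events/*.jsonl")
events = files | read_jsonl(chunksize=500)  # yields lists of up to 500 parsed lines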
@@ -1,8 +1,9 @@
 """Fetches Shopify Orders and Products."""
 
-from typing import Iterable, Optional
+from typing import Any, Dict, Iterable, Optional  # noqa: F401
 
 import dlt
+from dlt.common import jsonpath as jp  # noqa: F401
 from dlt.common import pendulum
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TAnyDateTime, TDataItem
@@ -12,6 +13,7 @@ from .helpers import ShopifyApi, ShopifyGraphQLApi, TOrderStatus
 from .settings import (
     DEFAULT_API_VERSION,
     DEFAULT_ITEMS_PER_PAGE,
+    DEFAULT_PARTNER_API_VERSION,  # noqa: F401
     FIRST_DAY_OF_MILLENNIUM,
 )
 
@@ -158,8 +158,8 @@ class ShopifyGraphQLApi:
         query: str,
         data_items_path: jsonpath.TJsonPath,
         pagination_cursor_path: jsonpath.TJsonPath,
-        pagination_cursor_has_next_page_path: jsonpath.TJsonPath,
         pagination_variable_name: str,
+        pagination_cursor_has_next_page_path: Optional[jsonpath.TJsonPath] = None,
         variables: Optional[DictStrAny] = None,
     ) -> Iterable[TDataItems]:
         variables = dict(variables or {})
ingestr/src/sources.py CHANGED
@@ -6,12 +6,15 @@ from typing import Any, Callable, Optional
 from urllib.parse import parse_qs, urlparse
 
 import dlt
+from dlt.common.configuration.specs import AwsCredentials
+from dlt.common.typing import TSecretStrValue
 
 from ingestr.src.adjust._init_ import adjust_source
 from ingestr.src.airtable import airtable_source
 from ingestr.src.appsflyer._init_ import appsflyer_source
 from ingestr.src.chess import source
 from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
+from ingestr.src.filesystem import readers
 from ingestr.src.google_sheets import google_spreadsheet
 from ingestr.src.gorgias import gorgias_source
 from ingestr.src.hubspot import hubspot
@@ -25,6 +28,11 @@ from ingestr.src.slack import slack_source
 from ingestr.src.sql_database import sql_table
 from ingestr.src.stripe_analytics import stripe_source
 from ingestr.src.table_definition import table_string_to_dataclass
+from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
+from ingestr.src.zendesk.helpers.credentials import (
+    ZendeskCredentialsOAuth,
+    ZendeskCredentialsToken,
+)
 
 
 class SqlSource:
@@ -310,8 +318,8 @@ class GoogleSheetsSource:
         table_fields = table_string_to_dataclass(table)
         return self.table_builder(
             credentials=credentials,
-            spreadsheet_url_or_id=table_fields.table,
-            range_names=[table_fields.dataset],
+            spreadsheet_url_or_id=table_fields.dataset,
+            range_names=[table_fields.table],
             get_named_ranges=False,
         )
 
@@ -734,3 +742,142 @@ class AppsflyerSource:
             start_date=start_date,
             end_date=end_date,
         ).with_resources(resource)
+
+
+class ZendeskSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Zendesk takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+        start_date = (
+            interval_start.strftime("%Y-%m-%d") if interval_start else "2000-01-01"
+        )
+        end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None
+
+        source_fields = urlparse(uri)
+        subdomain = source_fields.hostname
+        if not subdomain:
+            raise ValueError("Subdomain is required to connect with Zendesk")
+
+        if not source_fields.username and source_fields.password:
+            oauth_token = source_fields.password
+            if not oauth_token:
+                raise ValueError(
+                    "oauth_token in the URI is required to connect to Zendesk"
+                )
+            credentials = ZendeskCredentialsOAuth(
+                subdomain=subdomain, oauth_token=oauth_token
+            )
+        elif source_fields.username and source_fields.password:
+            email = source_fields.username
+            api_token = source_fields.password
+            if not email or not api_token:
+                raise ValueError(
+                    "Both email and token must be provided to connect to Zendesk"
+                )
+            credentials = ZendeskCredentialsToken(
+                subdomain=subdomain, email=email, token=api_token
+            )
+        else:
+            raise ValueError("Invalid URI format")
+
+        if table in [
+            "ticket_metrics",
+            "users",
+            "ticket_metric_events",
+            "ticket_forms",
+            "tickets",
+            "targets",
+            "activities",
+            "brands",
+            "groups",
+            "organizations",
+            "sla_policies",
+            "automations",
+        ]:
+            return zendesk_support(
+                credentials=credentials, start_date=start_date, end_date=end_date
+            ).with_resources(table)
+        elif table in [
+            "greetings",
+            "settings",
+            "addresses",
+            "legs_incremental",
+            "calls",
+            "phone_numbers",
+            "lines",
+            "agents_activity",
+        ]:
+            return zendesk_talk(
+                credentials=credentials, start_date=start_date, end_date=end_date
+            ).with_resources(table)
+        elif table in ["chats"]:
+            return zendesk_chat(
+                credentials=credentials, start_date=start_date, end_date=end_date
+            ).with_resources(table)
+        else:
+            raise ValueError(
+                f"Resource '{table}' is not supported for the Zendesk source yet. If you are interested in it, please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
+
+
+class S3Source:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "S3 takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        source_fields = parse_qs(parsed_uri.query)
+        access_key_id = source_fields.get("access_key_id")
+        if not access_key_id:
+            raise ValueError("access_key_id is required to connect to S3")
+
+        secret_access_key = source_fields.get("secret_access_key")
+        if not secret_access_key:
+            raise ValueError("secret_access_key is required to connect to S3")
+
+        bucket_name = parsed_uri.hostname
+        if not bucket_name:
+            raise ValueError(
+                "Invalid S3 URI: The bucket name is missing. Ensure your S3 URI follows the format 's3://bucket-name/path/to/file'"
+            )
+        bucket_url = f"s3://{bucket_name}"
+
+        path_to_file = parsed_uri.path.lstrip("/")
+        if not path_to_file:
+            raise ValueError(
+                "Invalid S3 URI: The file path is missing. Ensure your S3 URI follows the format 's3://bucket-name/path/to/file'"
+            )
+
+        aws_credentials = AwsCredentials(
+            aws_access_key_id=access_key_id[0],
+            aws_secret_access_key=TSecretStrValue(secret_access_key[0]),
+        )
+
+        file_extension = path_to_file.split(".")[-1]
+        if file_extension == "csv":
+            endpoint = "read_csv"
+        elif file_extension == "jsonl":
+            endpoint = "read_jsonl"
+        elif file_extension == "parquet":
+            endpoint = "read_parquet"
+        else:
+            raise ValueError(
+                "S3 source only supports these file formats: csv, jsonl, parquet"
+            )
+
+        return readers(
+            bucket_url=bucket_url, credentials=aws_credentials, file_glob=path_to_file
+        ).with_resources(endpoint)
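Taken together, the parsing code above implies the following URI shapes for the two new sources (all values are placeholders; the query-string keys and userinfo layout come directly from this hunk):

# Zendesk: host is the subdomain; either an OAuth token alone in the password
# slot, or email:api_token as the userinfo pair.
zendesk_oauth_uri = "zendesk://:OAUTH_TOKEN@mycompany"
zendesk_token_uri = "zendesk://email:API_TOKEN@mycompany"

# S3: bucket as host, object key as path, credentials in the query string; the
# file extension (csv, jsonl, parquet) selects the reader resource.
s3_uri = "s3://my-bucket/exports/orders.csv?access_key_id=AKIA...&secret_access_key=SECRET"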
ingestr/src/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.8.3"
+__version__ = "0.9.0"