ingestr 0.8.3__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/src/factory.py CHANGED
@@ -28,10 +28,12 @@ from ingestr.src.sources import (
     LocalCsvSource,
     MongoDbSource,
     NotionSource,
+    S3Source,
     ShopifySource,
     SlackSource,
     SqlSource,
     StripeAnalyticsSource,
+    ZendeskSource,
 )
 
 SQL_SOURCE_SCHEMES = [
@@ -132,6 +134,10 @@ class SourceDestinationFactory:
             return KafkaSource()
         elif self.source_scheme == "adjust":
             return AdjustSource()
+        elif self.source_scheme == "zendesk":
+            return ZendeskSource()
+        elif self.source_scheme == "s3":
+            return S3Source()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
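For context, the dispatch above keys purely on the scheme of the source URI, so "zendesk" and "s3" URIs now resolve to the two new source classes. A tiny illustration (the URIs below are placeholders, not taken from this release):

from urllib.parse import urlparse

# Hypothetical URIs; only their scheme matters for SourceDestinationFactory.
for uri in ("zendesk://email:api_token@mycompany", "s3://my-bucket/exports/data.csv"):
    print(urlparse(uri).scheme)  # prints "zendesk", then "s3"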
@@ -0,0 +1,98 @@
+"""Reads files in s3, gs or azure buckets using fsspec and provides convenience resources for chunked reading of various file formats"""
+
+from typing import Iterator, List, Optional, Tuple, Union
+
+import dlt
+from dlt.sources import DltResource
+from dlt.sources.credentials import FileSystemCredentials
+from dlt.sources.filesystem import FileItem, FileItemDict, fsspec_filesystem, glob_files
+
+from .helpers import (
+    AbstractFileSystem,
+    FilesystemConfigurationResource,
+)
+from .readers import (
+    ReadersSource,
+    _read_csv,
+    _read_csv_duckdb,
+    _read_jsonl,
+    _read_parquet,
+)
+
+
+@dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource)
+def readers(
+    bucket_url: str,
+    credentials: Union[FileSystemCredentials, AbstractFileSystem],
+    file_glob: Optional[str] = "*",
+) -> Tuple[DltResource, ...]:
+    """This source provides a few resources that are chunked file readers. Readers can be further parametrized before use:
+    read_csv(chunksize, **pandas_kwargs)
+    read_jsonl(chunksize)
+    read_parquet(chunksize)
+
+    Args:
+        bucket_url (str): The url to the bucket.
+        credentials (FileSystemCredentials | AbstractFileSystem): The credentials to the filesystem, or an fsspec `AbstractFileSystem` instance.
+        file_glob (str, optional): The filter to apply to the files in glob format. By default lists all files in bucket_url non-recursively.
+    """
+    filesystem_resource = filesystem(bucket_url, credentials, file_glob=file_glob)
+    filesystem_resource.apply_hints(
+        incremental=dlt.sources.incremental("modification_date")
+    )
+    return (
+        filesystem_resource | dlt.transformer(name="read_csv")(_read_csv),
+        filesystem_resource | dlt.transformer(name="read_jsonl")(_read_jsonl),
+        filesystem_resource | dlt.transformer(name="read_parquet")(_read_parquet),
+        filesystem_resource | dlt.transformer(name="read_csv_duckdb")(_read_csv_duckdb),
+    )
+
+
+@dlt.resource(
+    primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True
+)
+def filesystem(
+    bucket_url: str = dlt.secrets.value,
+    credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value,
+    file_glob: Optional[str] = "*",
+    files_per_page: int = 100,
+    extract_content: bool = True,
+) -> Iterator[List[FileItem]]:
+    """This resource lists files in `bucket_url` using the `file_glob` pattern. The files are yielded as FileItem, which also
+    provides methods to open and read file data. It should be combined with transformers that further process (i.e. load) the files.
+
+    Args:
+        bucket_url (str): The url to the bucket.
+        credentials (FileSystemCredentials | AbstractFileSystem): The credentials to the filesystem, or an fsspec `AbstractFileSystem` instance.
+        file_glob (str, optional): The filter to apply to the files in glob format. By default lists all files in bucket_url non-recursively.
+        files_per_page (int, optional): The number of files to process at once, defaults to 100.
+        extract_content (bool, optional): If true, the content of the file will be extracted;
+            if false, an fsspec file is returned instead. Defaults to True.
+
+    Returns:
+        Iterator[List[FileItem]]: The list of files.
+    """
+
+    if isinstance(credentials, AbstractFileSystem):
+        fs_client = credentials
+    else:
+        fs_client = fsspec_filesystem(bucket_url, credentials)[0]
+
+    files_chunk: List[FileItem] = []
+    for file_model in glob_files(fs_client, bucket_url, file_glob):
+        file_dict = FileItemDict(file_model, credentials)
+        if extract_content:
+            file_dict["file_content"] = file_dict.read_bytes()
+        files_chunk.append(file_dict)  # type: ignore
+        # wait for the chunk to be full
+        if len(files_chunk) >= files_per_page:
+            yield files_chunk
+            files_chunk = []
+    if files_chunk:
+        yield files_chunk
+
+
+read_csv = dlt.transformer(standalone=True)(_read_csv)
+read_jsonl = dlt.transformer(standalone=True)(_read_jsonl)
+read_parquet = dlt.transformer(standalone=True)(_read_parquet)
+read_csv_duckdb = dlt.transformer(standalone=True)(_read_csv_duckdb)
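A minimal sketch of how this new source is consumed (it mirrors the S3Source.dlt_source wiring later in this diff; the bucket, key values, pipeline name and duckdb destination are placeholders, not part of the release):

import dlt
from dlt.common.configuration.specs import AwsCredentials
from ingestr.src.filesystem import readers

# Placeholder credentials and paths; only the call shape comes from this diff.
creds = AwsCredentials(aws_access_key_id="AKIA...", aws_secret_access_key="...")
source = readers(
    bucket_url="s3://example-bucket",
    credentials=creds,
    file_glob="exports/orders.csv",
).with_resources("read_csv")

pipeline = dlt.pipeline(pipeline_name="s3_example", destination="duckdb")
pipeline.run(source)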
@@ -0,0 +1,100 @@
+"""Helpers for the filesystem resource."""
+
+from typing import Any, Dict, Iterable, List, Optional, Type, Union
+
+import dlt
+from dlt.common.configuration import resolve_type
+from dlt.common.typing import TDataItem
+from dlt.sources import DltResource
+from dlt.sources.config import configspec, with_config
+from dlt.sources.credentials import (
+    CredentialsConfiguration,
+    FilesystemConfiguration,
+    FileSystemCredentials,
+)
+from dlt.sources.filesystem import fsspec_filesystem
+from fsspec import AbstractFileSystem  # type: ignore
+
+
+@configspec
+class FilesystemConfigurationResource(FilesystemConfiguration):
+    credentials: Union[FileSystemCredentials, AbstractFileSystem] = None
+    file_glob: Optional[str] = "*"
+    files_per_page: int = 100
+    extract_content: bool = False
+
+    @resolve_type("credentials")
+    def resolve_credentials_type(self) -> Type[CredentialsConfiguration]:
+        # use known credentials or empty credentials for unknown protocol
+        return Union[
+            self.PROTOCOL_CREDENTIALS.get(self.protocol)
+            or Optional[CredentialsConfiguration],
+            AbstractFileSystem,
+        ]  # type: ignore[return-value]
+
+
+def fsspec_from_resource(filesystem_instance: DltResource) -> AbstractFileSystem:
+    """Extract authorized fsspec client from a filesystem resource"""
+
+    @with_config(
+        spec=FilesystemConfiguration,
+        sections=("sources", filesystem_instance.section, filesystem_instance.name),
+    )
+    def _get_fsspec(
+        bucket_url: str, credentials: Optional[FileSystemCredentials]
+    ) -> AbstractFileSystem:
+        return fsspec_filesystem(bucket_url, credentials)[0]
+
+    return _get_fsspec(
+        filesystem_instance.explicit_args.get("bucket_url", dlt.config.value),
+        filesystem_instance.explicit_args.get("credentials", dlt.secrets.value),
+    )
+
+
+def add_columns(columns: List[str], rows: List[List[Any]]) -> List[Dict[str, Any]]:
+    """Adds column names to the given rows.
+
+    Args:
+        columns (List[str]): The column names.
+        rows (List[List[Any]]): The rows.
+
+    Returns:
+        List[Dict[str, Any]]: The rows with column names.
+    """
+    result = []
+    for row in rows:
+        result.append(dict(zip(columns, row)))
+
+    return result
+
+
+def fetch_arrow(file_data, chunk_size: int) -> Iterable[TDataItem]:  # type: ignore
+    """Fetches data from the given CSV file.
+
+    Args:
+        file_data (DuckDBPyRelation): The CSV file data.
+        chunk_size (int): The number of rows to read at once.
+
+    Yields:
+        Iterable[TDataItem]: Data items, read from the given CSV file.
+    """
+    batcher = file_data.fetch_arrow_reader(batch_size=chunk_size)
+    yield from batcher
+
+
+def fetch_json(file_data, chunk_size: int) -> List[Dict[str, Any]]:  # type: ignore
+    """Fetches data from the given CSV file.
+
+    Args:
+        file_data (DuckDBPyRelation): The CSV file data.
+        chunk_size (int): The number of rows to read at once.
+
+    Yields:
+        Iterable[TDataItem]: Data items, read from the given CSV file.
+    """
+    while True:
+        batch = file_data.fetchmany(chunk_size)
+        if not batch:
+            break
+
+        yield add_columns(file_data.columns, batch)
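For reference, add_columns is what fetch_json uses to turn DuckDB's positional rows into dicts; a quick illustration with made-up values:

add_columns(["id", "name"], [[1, "alice"], [2, "bob"]])
# -> [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]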
@@ -0,0 +1,131 @@
+from typing import TYPE_CHECKING, Any, Iterator, Optional
+
+from dlt.common import json
+from dlt.common.typing import copy_sig
+from dlt.sources import DltResource, DltSource, TDataItems
+from dlt.sources.filesystem import FileItemDict
+
+from .helpers import fetch_arrow, fetch_json
+
+
+def _read_csv(
+    items: Iterator[FileItemDict], chunksize: int = 10000, **pandas_kwargs: Any
+) -> Iterator[TDataItems]:
+    """Reads csv file with Pandas chunk by chunk.
+
+    Args:
+        chunksize (int): Number of records to read in one chunk
+        **pandas_kwargs: Additional keyword arguments passed to Pandas.read_csv
+    Returns:
+        TDataItem: The file content
+    """
+    import pandas as pd
+
+    # apply defaults to pandas kwargs
+    kwargs = {**{"header": "infer", "chunksize": chunksize}, **pandas_kwargs}
+
+    for file_obj in items:
+        # Here we use pandas chunksize to read the file in chunks and avoid loading the whole file
+        # in memory.
+        with file_obj.open() as file:
+            for df in pd.read_csv(file, **kwargs):
+                yield df.to_dict(orient="records")
+
+
+def _read_jsonl(
+    items: Iterator[FileItemDict], chunksize: int = 1000
+) -> Iterator[TDataItems]:
+    """Reads jsonl file content and extracts the data.
+
+    Args:
+        chunksize (int, optional): The number of JSON lines to load and yield at once, defaults to 1000
+
+    Returns:
+        TDataItem: The file content
+    """
+    for file_obj in items:
+        with file_obj.open() as f:
+            lines_chunk = []
+            for line in f:
+                lines_chunk.append(json.loadb(line))
+                if len(lines_chunk) >= chunksize:
+                    yield lines_chunk
+                    lines_chunk = []
+        if lines_chunk:
+            yield lines_chunk
+
+
+def _read_parquet(
+    items: Iterator[FileItemDict],
+    chunksize: int = 10,
+) -> Iterator[TDataItems]:
+    """Reads parquet file content and extracts the data.
+
+    Args:
+        chunksize (int, optional): The number of files to process at once, defaults to 10.
+
+    Returns:
+        TDataItem: The file content
+    """
+    from pyarrow import parquet as pq
+
+    for file_obj in items:
+        with file_obj.open() as f:
+            parquet_file = pq.ParquetFile(f)
+            for rows in parquet_file.iter_batches(batch_size=chunksize):
+                yield rows.to_pylist()
+
+
+def _read_csv_duckdb(
+    items: Iterator[FileItemDict],
+    chunk_size: Optional[int] = 5000,
+    use_pyarrow: bool = False,
+    **duckdb_kwargs: Any,
+) -> Iterator[TDataItems]:
+    """A resource to extract data from the given CSV files.
+
+    Uses DuckDB engine to import and cast CSV data.
+
+    Args:
+        items (Iterator[FileItemDict]): CSV files to read.
+        chunk_size (Optional[int]):
+            The number of rows to read at once. Defaults to 5000.
+        use_pyarrow (bool):
+            Whether to use `pyarrow` to read the data and designate
+            data schema. If set to False (by default), JSON is used.
+        duckdb_kwargs (Dict):
+            Additional keyword arguments to pass to the `read_csv()`.
+
+    Returns:
+        Iterable[TDataItem]: Data items, read from the given CSV files.
+    """
+    import duckdb
+
+    helper = fetch_arrow if use_pyarrow else fetch_json
+
+    for item in items:
+        with item.open() as f:
+            file_data = duckdb.from_csv_auto(f, **duckdb_kwargs)  # type: ignore
+
+            yield from helper(file_data, chunk_size)
+
+
+if TYPE_CHECKING:
+
+    class ReadersSource(DltSource):
+        """This is a typing stub that provides docstrings and signatures to the resources in the `readers` source"""
+
+        @copy_sig(_read_csv)
+        def read_csv(self) -> DltResource: ...
+
+        @copy_sig(_read_jsonl)
+        def read_jsonl(self) -> DltResource: ...
+
+        @copy_sig(_read_parquet)
+        def read_parquet(self) -> DltResource: ...
+
+        @copy_sig(_read_csv_duckdb)
+        def read_csv_duckdb(self) -> DltResource: ...
+
+else:
+    ReadersSource = DltSource
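As a rough sketch of how these readers combine with the filesystem resource added earlier in this diff: the listing resource is piped into a chunked reader, so only one chunk of rows is materialized at a time (bucket and glob are placeholders, and credentials are assumed to resolve from dlt secrets/config):

from ingestr.src.filesystem import filesystem, read_jsonl

# Placeholder bucket/glob; credentials resolve via dlt secrets in this sketch.
files = filesystem(bucket_url="s3://example-bucket", file_glob="events/*.jsonl")
events = files | read_jsonl(chunksize=500)  # yields lists of up to 500 parsed lines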
@@ -1,8 +1,9 @@
 """Fetches Shopify Orders and Products."""
 
-from typing import Iterable, Optional
+from typing import Any, Dict, Iterable, Optional  # noqa: F401
 
 import dlt
+from dlt.common import jsonpath as jp  # noqa: F401
 from dlt.common import pendulum
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TAnyDateTime, TDataItem
@@ -12,6 +13,7 @@ from .helpers import ShopifyApi, ShopifyGraphQLApi, TOrderStatus
 from .settings import (
     DEFAULT_API_VERSION,
     DEFAULT_ITEMS_PER_PAGE,
+    DEFAULT_PARTNER_API_VERSION,  # noqa: F401
     FIRST_DAY_OF_MILLENNIUM,
 )
 
@@ -158,8 +158,8 @@ class ShopifyGraphQLApi:
         query: str,
         data_items_path: jsonpath.TJsonPath,
         pagination_cursor_path: jsonpath.TJsonPath,
-        pagination_cursor_has_next_page_path: jsonpath.TJsonPath,
         pagination_variable_name: str,
+        pagination_cursor_has_next_page_path: Optional[jsonpath.TJsonPath] = None,
         variables: Optional[DictStrAny] = None,
     ) -> Iterable[TDataItems]:
         variables = dict(variables or {})
ingestr/src/sources.py CHANGED
@@ -6,12 +6,15 @@ from typing import Any, Callable, Optional
 from urllib.parse import parse_qs, urlparse
 
 import dlt
+from dlt.common.configuration.specs import AwsCredentials
+from dlt.common.typing import TSecretStrValue
 
 from ingestr.src.adjust._init_ import adjust_source
 from ingestr.src.airtable import airtable_source
 from ingestr.src.appsflyer._init_ import appsflyer_source
 from ingestr.src.chess import source
 from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
+from ingestr.src.filesystem import readers
 from ingestr.src.google_sheets import google_spreadsheet
 from ingestr.src.gorgias import gorgias_source
 from ingestr.src.hubspot import hubspot
@@ -25,6 +28,11 @@ from ingestr.src.slack import slack_source
 from ingestr.src.sql_database import sql_table
 from ingestr.src.stripe_analytics import stripe_source
 from ingestr.src.table_definition import table_string_to_dataclass
+from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
+from ingestr.src.zendesk.helpers.credentials import (
+    ZendeskCredentialsOAuth,
+    ZendeskCredentialsToken,
+)
 
 
 class SqlSource:
@@ -310,8 +318,8 @@ class GoogleSheetsSource:
         table_fields = table_string_to_dataclass(table)
         return self.table_builder(
             credentials=credentials,
-            spreadsheet_url_or_id=table_fields.table,
-            range_names=[table_fields.dataset],
+            spreadsheet_url_or_id=table_fields.dataset,
+            range_names=[table_fields.table],
             get_named_ranges=False,
         )
 
@@ -734,3 +742,142 @@ class AppsflyerSource:
             start_date=start_date,
             end_date=end_date,
         ).with_resources(resource)
+
+
+class ZendeskSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Zendesk takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+        start_date = (
+            interval_start.strftime("%Y-%m-%d") if interval_start else "2000-01-01"
+        )
+        end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None
+
+        source_fields = urlparse(uri)
+        subdomain = source_fields.hostname
+        if not subdomain:
+            raise ValueError("Subdomain is required to connect with Zendesk")
+
+        if not source_fields.username and source_fields.password:
+            oauth_token = source_fields.password
+            if not oauth_token:
+                raise ValueError(
+                    "oauth_token in the URI is required to connect to Zendesk"
+                )
+            credentials = ZendeskCredentialsOAuth(
+                subdomain=subdomain, oauth_token=oauth_token
+            )
+        elif source_fields.username and source_fields.password:
+            email = source_fields.username
+            api_token = source_fields.password
+            if not email or not api_token:
+                raise ValueError(
+                    "Both email and token must be provided to connect to Zendesk"
+                )
+            credentials = ZendeskCredentialsToken(
+                subdomain=subdomain, email=email, token=api_token
+            )
+        else:
+            raise ValueError("Invalid URI format")
+
+        if table in [
+            "ticket_metrics",
+            "users",
+            "ticket_metric_events",
+            "ticket_forms",
+            "tickets",
+            "targets",
+            "activities",
+            "brands",
+            "groups",
+            "organizations",
+            "sla_policies",
+            "automations",
+        ]:
+            return zendesk_support(
+                credentials=credentials, start_date=start_date, end_date=end_date
+            ).with_resources(table)
+        elif table in [
+            "greetings",
+            "settings",
+            "addresses",
+            "legs_incremental",
+            "calls",
+            "phone_numbers",
+            "lines",
+            "agents_activity",
+        ]:
+            return zendesk_talk(
+                credentials=credentials, start_date=start_date, end_date=end_date
+            ).with_resources(table)
+        elif table in ["chats"]:
+            return zendesk_chat(
+                credentials=credentials, start_date=start_date, end_date=end_date
+            ).with_resources(table)
+        else:
+            raise ValueError(
+                f"Resource '{table}' is not supported for the Zendesk source yet. If you are interested in it, please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
+
+
+class S3Source:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "S3 takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        source_fields = parse_qs(parsed_uri.query)
+        access_key_id = source_fields.get("access_key_id")
+        if not access_key_id:
+            raise ValueError("access_key_id is required to connect to S3")
+
+        secret_access_key = source_fields.get("secret_access_key")
+        if not secret_access_key:
+            raise ValueError("secret_access_key is required to connect to S3")
+
+        bucket_name = parsed_uri.hostname
+        if not bucket_name:
+            raise ValueError(
+                "Invalid S3 URI: The bucket name is missing. Ensure your S3 URI follows the format 's3://bucket-name/path/to/file'"
+            )
+        bucket_url = f"s3://{bucket_name}"
+
+        path_to_file = parsed_uri.path.lstrip("/")
+        if not path_to_file:
+            raise ValueError(
+                "Invalid S3 URI: The file path is missing. Ensure your S3 URI follows the format 's3://bucket-name/path/to/file'"
+            )
+
+        aws_credentials = AwsCredentials(
+            aws_access_key_id=access_key_id[0],
+            aws_secret_access_key=TSecretStrValue(secret_access_key[0]),
+        )
+
+        file_extension = path_to_file.split(".")[-1]
+        if file_extension == "csv":
+            endpoint = "read_csv"
+        elif file_extension == "jsonl":
+            endpoint = "read_jsonl"
+        elif file_extension == "parquet":
+            endpoint = "read_parquet"
+        else:
+            raise ValueError(
+                "S3 source only supports these file formats: csv, jsonl, parquet"
+            )
+
+        return readers(
+            bucket_url=bucket_url, credentials=aws_credentials, file_glob=path_to_file
+        ).with_resources(endpoint)
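Taken together, the parsing code above implies the following URI shapes for the two new sources (all values are placeholders; the query-string keys and userinfo layout come directly from this hunk):

# Zendesk: host is the subdomain; either an OAuth token alone in the password
# slot, or email:api_token as the userinfo pair.
zendesk_oauth_uri = "zendesk://:OAUTH_TOKEN@mycompany"
zendesk_token_uri = "zendesk://email:API_TOKEN@mycompany"

# S3: bucket as host, object key as path, credentials in the query string; the
# file extension (csv, jsonl, parquet) selects the reader resource.
s3_uri = "s3://my-bucket/exports/orders.csv?access_key_id=AKIA...&secret_access_key=SECRET"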
ingestr/src/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.8.3"
+__version__ = "0.9.0"