ingestr 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

@@ -0,0 +1,149 @@
1
+ """Gorgias source helpers"""
2
+
3
+ from typing import Any, Iterable, Optional, Tuple
4
+
5
+ from dlt.common.pendulum import pendulum
6
+ from dlt.common.time import ensure_pendulum_datetime
7
+ from dlt.common.typing import Dict, TDataItems
8
+ from dlt.sources.helpers import requests
9
+ from pyrate_limiter import Duration, Limiter, Rate
10
+ from requests.auth import HTTPBasicAuth
11
+
12
+
13
def get_max_datetime_from_datetime_fields(
    item: Dict[str, Any],
) -> Tuple[str, Optional[pendulum.DateTime]]:
    """Find the latest value among the item's ``*_datetime`` fields.

    Only keys ending in ``_datetime`` whose value is not ``None`` are
    considered; each value is parsed with ``ensure_pendulum_datetime``.

    Args:
        item: A single record, keyed by field name.

    Returns:
        A ``(field_name, value)`` pair for the latest datetime found. Both
        elements are ``None`` when the item has no usable ``*_datetime`` field.
        On ties the field encountered first wins.
    """
    best_name = None
    best_value = None

    for name, raw in item.items():
        if not name.endswith("_datetime") or raw is None:
            continue

        parsed = ensure_pendulum_datetime(raw)
        # Strict comparison: an equal later value does not replace the winner.
        if best_value is None or parsed > best_value:
            best_name, best_value = name, parsed

    return best_name, best_value
28
+
29
+
30
def convert_datetime_fields(item: Dict[str, Any]) -> Dict[str, Any]:
    """Parse every non-null ``*_datetime`` value of ``item`` in place.

    If the item has no ``updated_datetime`` key at all, one is added holding
    the latest of its ``*_datetime`` values (``None`` when there are none).

    Args:
        item: The record to convert; it is modified in place.

    Returns:
        The same ``item`` object, for convenience.
    """
    datetime_keys = [
        key
        for key in item
        if key.endswith("_datetime") and item[key] is not None
    ]
    for key in datetime_keys:
        item[key] = ensure_pendulum_datetime(item[key])

    if "updated_datetime" in item:
        return item

    # No explicit update timestamp: fall back to the latest known datetime.
    item["updated_datetime"] = get_max_datetime_from_datetime_fields(item)[1]
    return item
40
+
41
+
42
def find_latest_timestamp_from_page(
    items: list[Dict[str, Any]],
) -> Optional[pendulum.DateTime]:
    """Return the latest ``*_datetime`` value found across a page of items.

    Items that carry no usable ``*_datetime`` value are ignored instead of
    being passed on to the datetime parser as ``None``.

    Args:
        items: The records of one page.

    Returns:
        The latest datetime over all items, or ``None`` when the page is empty
        or none of its items has a ``*_datetime`` value.
    """
    latest_time = None
    for item in items:
        _, candidate = get_max_datetime_from_datetime_fields(item)
        if candidate is None:
            # Nothing to compare for this item.
            continue
        if latest_time is None or candidate > latest_time:
            latest_time = candidate

    return latest_time
52
+
53
+
54
class GorgiasApi:
    """
    A Gorgias API client that can be used to get pages of data from Gorgias.
    """

    def __init__(
        self,
        domain: str,
        email: str,
        api_key: str,
    ) -> None:
        """
        Args:
            domain: The domain of your Gorgias account.
            email: The email associated with your Gorgias account.
            api_key: The API key for accessing the Gorgias API.
        """
        self.domain = domain
        self.email = email
        self.api_key = api_key

    def get_pages(
        self,
        resource: str,
        params: Optional[Dict[str, Any]] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
    ) -> Iterable[TDataItems]:
        """Get all pages from Gorgias using requests.
        Iterates through all pages and yield each page items.

        Args:
            resource: The resource path appended to
                ``https://<domain>.gorgias.com/api/``.
            params: Query params to include in the request. The mapping is
                copied, so the caller's dict is never modified. ``limit`` is
                always set to 100 and ``order_by`` defaults to
                ``updated_datetime:desc``.
            start_date: Optional lower bound; items whose ``updated_datetime``
                is earlier are dropped. Paging also stops once a page's latest
                timestamp is older than this bound.
            end_date: Optional upper bound; items whose ``updated_datetime``
                is later are dropped.

        Yields:
            List of data items from the page
        """
        url = f"https://{self.domain}.gorgias.com/api/{resource}"
        rate = Rate(2, Duration.SECOND)
        # NOTE(review): the return value of try_acquire() is ignored below and
        # raise_when_fail is False - confirm that this actually throttles.
        limiter = Limiter(rate, raise_when_fail=False)

        start_date_obj = ensure_pendulum_datetime(start_date) if start_date else None

        # Work on a copy so limit/order_by/cursor never leak into the caller's dict.
        params = dict(params) if params else {}
        params["limit"] = 100
        if "order_by" not in params:
            params["order_by"] = "updated_datetime:desc"

        auth = HTTPBasicAuth(self.email, self.api_key)

        while True:
            limiter.try_acquire(f"gorgias-{self.domain}")
            response = requests.get(url, params=params, auth=auth)
            response.raise_for_status()

            # Decode the body once and reuse it for every check below.
            payload = response.json()
            page = payload["data"]
            if not page:
                break

            items = self.__filter_items_in_range(page, start_date, end_date)
            if items:
                yield items

            # if there is no cursor, yield the items first and then break the loop
            cursor = payload.get("meta", {}).get("next_cursor")
            params["cursor"] = cursor
            if not cursor:
                break

            if start_date_obj:
                # NOTE(review): this early stop presumes newest-first ordering
                # (the default order_by) - confirm for custom order_by values.
                max_datetime = find_latest_timestamp_from_page(page)
                if (
                    max_datetime is not None
                    and start_date_obj > ensure_pendulum_datetime(max_datetime)
                ):
                    break

    def __filter_items_in_range(
        self,
        items: list[Dict[str, Any]],
        start_date: Optional[str],
        end_date: Optional[str],
    ) -> list[Dict[str, Any]]:
        """Convert the items' datetime fields and keep those inside the range.

        Args:
            items: The raw records of one page; they are converted in place.
            start_date: Optional lower bound on ``updated_datetime``.
            end_date: Optional upper bound on ``updated_datetime``.

        Returns:
            The converted items whose ``updated_datetime`` is not before
            ``start_date`` and not after ``end_date``. Items without any
            timestamp are kept, since they cannot be shown to be out of range.
        """
        start_date_obj = ensure_pendulum_datetime(start_date) if start_date else None
        end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None

        filtered = []
        for item in items:
            converted_item = convert_datetime_fields(item)
            updated = converted_item["updated_datetime"]
            # A missing timestamp cannot be compared with the bounds.
            if updated is not None:
                if start_date_obj and updated < start_date_obj:
                    continue
                if end_date_obj and updated > end_date_obj:
                    continue
            filtered.append(converted_item)

        return filtered
@@ -0,0 +1,45 @@
1
+ from dlt.common.pendulum import pendulum
2
+
3
+ from .helpers import convert_datetime_fields, find_latest_timestamp_from_page
4
+
5
+
6
def test_convert_datetime_fields():
    """String timestamps are parsed and ``updated_datetime`` is derived."""

    def utc(*parts):
        return pendulum.datetime(*parts, tz="UTC")

    raw = {
        "key1": "val1",
        "created_datetime": "2024-06-20T07:39:36.514848+00:00",
        "sent_datetime": "2024-06-20T07:40:20.166593+00:00",
        "should_send_datetime": "2024-06-20T07:39:37.514848+00:00",
    }
    expected = {
        "key1": "val1",
        "created_datetime": utc(2024, 6, 20, 7, 39, 36, 514848),
        "sent_datetime": utc(2024, 6, 20, 7, 40, 20, 166593),
        "should_send_datetime": utc(2024, 6, 20, 7, 39, 37, 514848),
        # Latest of the three timestamps above (sent_datetime).
        "updated_datetime": utc(2024, 6, 20, 7, 40, 20, 166593),
    }

    assert convert_datetime_fields(raw) == expected
25
+
26
+
27
def test_find_latest_timestamp_from_page():
    """The latest timestamp over every item of the page is returned."""
    first = {
        "key1": "val1",
        "created_datetime": "2024-06-20T07:39:36.514848+00:00",
        "sent_datetime": "2024-06-20T07:40:20.166593+00:00",
        "should_send_datetime": "2024-06-20T07:39:37.514848+00:00",
    }
    second = {
        "key1": "val2",
        "created_datetime": "2024-06-20T07:39:36.514848+00:00",
        "sent_datetime": "2024-06-20T07:40:21.123123+00:00",
        "should_send_datetime": "2024-06-20T07:39:37.514848+00:00",
    }

    latest = find_latest_timestamp_from_page([first, second])

    # sent_datetime of the second item is the newest value on the page.
    expected = pendulum.datetime(2024, 6, 20, 7, 40, 21, 123123, tz="UTC")
    assert latest == expected
@@ -0,0 +1,227 @@
1
+ """Fetches Shopify Orders and Products."""
2
+
3
+ from typing import Any, Dict, Iterable, Optional
4
+
5
+ import dlt
6
+ from dlt.common import jsonpath as jp
7
+ from dlt.common import pendulum
8
+ from dlt.common.time import ensure_pendulum_datetime
9
+ from dlt.common.typing import TAnyDateTime, TDataItem
10
+ from dlt.sources import DltResource
11
+
12
+ from .helpers import ShopifyApi, ShopifyPartnerApi, TOrderStatus
13
+ from .settings import (
14
+ DEFAULT_API_VERSION,
15
+ DEFAULT_ITEMS_PER_PAGE,
16
+ DEFAULT_PARTNER_API_VERSION,
17
+ FIRST_DAY_OF_MILLENNIUM,
18
+ )
19
+
20
+
21
@dlt.source(name="shopify", max_table_nesting=0)
def shopify_source(
    private_app_password: str = dlt.secrets.value,
    api_version: str = DEFAULT_API_VERSION,
    shop_url: str = dlt.config.value,
    start_date: TAnyDateTime = FIRST_DAY_OF_MILLENNIUM,
    end_date: Optional[TAnyDateTime] = None,
    created_at_min: TAnyDateTime = FIRST_DAY_OF_MILLENNIUM,
    items_per_page: int = DEFAULT_ITEMS_PER_PAGE,
    order_status: TOrderStatus = "any",
) -> Iterable[DltResource]:
    """
    The source for the Shopify pipeline. Available resources are products, orders, and customers.

    `start_date` can be used on its own or together with `end_date`. When both are provided
    data is limited to items updated in that time range.
    The range is "half-open", meaning elements equal and newer than `start_date` and elements older than `end_date` are included.
    Every resource declares its `updated_at` cursor with `allow_external_schedulers=True`.

    Args:
        private_app_password: The app password to the app on your shop.
        api_version: The API version to use (e.g. 2023-01).
        shop_url: The URL of your shop (e.g. https://my-shop.myshopify.com).
        start_date: Items updated on or after this date are imported. Defaults to 2000-01-01.
            Used as the initial value of the `updated_at` incremental cursor.
            Accepts any `date`/`datetime` object or a date/datetime string in ISO 8601 format.
        end_date: The end of the range for which to load data; used as the end value of the
            `updated_at` incremental cursor and sent as `updated_at_max` when set.
        created_at_min: The minimum creation date of items to import. Defaults to 2000-01-01.
        items_per_page: The max number of items to fetch per page. Defaults to 250.
        order_status: The order status to filter by. Can be 'open', 'closed', 'cancelled', or 'any'. Defaults to 'any'.

    Returns:
        Iterable[DltResource]: The products, orders and customers resources.
    """

    # Normalise the date inputs once; the resources below capture them as defaults.
    start_date_obj = ensure_pendulum_datetime(start_date)
    end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None
    created_at_min_obj = ensure_pendulum_datetime(created_at_min)

    # One API client shared by all resources.
    client = ShopifyApi(shop_url, private_app_password, api_version)

    @dlt.resource(primary_key="id", write_disposition="merge")
    def products(
        updated_at: dlt.sources.incremental[
            pendulum.DateTime
        ] = dlt.sources.incremental(
            "updated_at",
            initial_value=start_date_obj,
            end_value=end_date_obj,
            allow_external_schedulers=True,
        ),
        created_at_min: pendulum.DateTime = created_at_min_obj,
        items_per_page: int = items_per_page,
    ) -> Iterable[TDataItem]:
        """
        Yield pages of products, requested in ascending `updated_at` order.

        Args:
            updated_at: The incremental cursor on 'updated_at'.
            created_at_min: Lower bound sent as the `created_at_min` query param.
            items_per_page: Page size sent as the `limit` query param.

        Returns:
            Iterable[TDataItem]: A generator of products.
        """
        params = {
            "updated_at_min": updated_at.last_value.isoformat(),
            "limit": items_per_page,
            "order": "updated_at asc",
            "created_at_min": created_at_min.isoformat(),
        }
        window_end = updated_at.end_value
        if window_end is not None:
            params["updated_at_max"] = window_end.isoformat()
        yield from client.get_pages("products", params)

    @dlt.resource(primary_key="id", write_disposition="merge")
    def orders(
        updated_at: dlt.sources.incremental[
            pendulum.DateTime
        ] = dlt.sources.incremental(
            "updated_at",
            initial_value=start_date_obj,
            end_value=end_date_obj,
            allow_external_schedulers=True,
        ),
        created_at_min: pendulum.DateTime = created_at_min_obj,
        items_per_page: int = items_per_page,
        status: TOrderStatus = order_status,
    ) -> Iterable[TDataItem]:
        """
        Yield pages of orders, requested in ascending `updated_at` order.

        Args:
            updated_at: The incremental cursor on 'updated_at'.
            created_at_min: Lower bound sent as the `created_at_min` query param.
            items_per_page: Page size sent as the `limit` query param.
            status: Order status sent as the `status` query param.

        Returns:
            Iterable[TDataItem]: A generator of orders.
        """
        params = {
            "updated_at_min": updated_at.last_value.isoformat(),
            "limit": items_per_page,
            "status": status,
            "order": "updated_at asc",
            "created_at_min": created_at_min.isoformat(),
        }
        window_end = updated_at.end_value
        if window_end is not None:
            params["updated_at_max"] = window_end.isoformat()
        yield from client.get_pages("orders", params)

    @dlt.resource(primary_key="id", write_disposition="merge")
    def customers(
        updated_at: dlt.sources.incremental[
            pendulum.DateTime
        ] = dlt.sources.incremental(
            "updated_at",
            initial_value=start_date_obj,
            end_value=end_date_obj,
            allow_external_schedulers=True,
        ),
        created_at_min: pendulum.DateTime = created_at_min_obj,
        items_per_page: int = items_per_page,
    ) -> Iterable[TDataItem]:
        """
        Yield pages of customers, requested in ascending `updated_at` order.

        Args:
            updated_at: The incremental cursor on 'updated_at'.
            created_at_min: Lower bound sent as the `created_at_min` query param.
            items_per_page: Page size sent as the `limit` query param.

        Returns:
            Iterable[TDataItem]: A generator of customers.
        """
        params = {
            "updated_at_min": updated_at.last_value.isoformat(),
            "limit": items_per_page,
            "order": "updated_at asc",
            "created_at_min": created_at_min.isoformat(),
        }
        window_end = updated_at.end_value
        if window_end is not None:
            params["updated_at_max"] = window_end.isoformat()
        yield from client.get_pages("customers", params)

    return products, orders, customers
165
+
166
+
167
@dlt.resource
def shopify_partner_query(
    query: str,
    data_items_path: jp.TJsonPath,
    pagination_cursor_path: jp.TJsonPath,
    pagination_variable_name: str = "after",
    variables: Optional[Dict[str, Any]] = None,
    access_token: str = dlt.secrets.value,
    organization_id: str = dlt.config.value,
    api_version: str = DEFAULT_PARTNER_API_VERSION,
) -> Iterable[TDataItem]:
    """
    Resource for getting paginated results from the Shopify Partner GraphQL API.

    The work is delegated to `ShopifyPartnerApi.get_graphql_pages`: the query is run,
    the data items are extracted from the result, and the query is run again with the
    pagination cursor to fetch the next page.

    Example:
        query = '''query Transactions($after: String) {
            transactions(after: $after, first: 100) {
                edges {
                    cursor
                    node {
                        id
                    }
                }
            }
        }'''

        shopify_partner_query(
            query,
            data_items_path="data.transactions.edges[*].node",
            pagination_cursor_path="data.transactions.edges[-1].cursor",
            pagination_variable_name="after",
        )

    Args:
        query: The GraphQL query to run.
        data_items_path: The JSONPath to the data items in the query result. Should resolve to array items.
        pagination_cursor_path: The JSONPath to the pagination cursor in the query result, will be piped to the next query via variables.
        pagination_variable_name: The name of the variable to pass the pagination cursor to.
        variables: Mapping of extra variables used in the query.
        access_token: The Partner API Client access token, created in the Partner Dashboard.
        organization_id: Your Organization ID, found in the Partner Dashboard.
        api_version: The API version to use (e.g. 2024-01). Use `unstable` for the latest version.
    Returns:
        Iterable[TDataItem]: A generator of the query results.
    """
    partner_api = ShopifyPartnerApi(
        access_token=access_token,
        organization_id=organization_id,
        api_version=api_version,
    )

    pages = partner_api.get_graphql_pages(
        query,
        data_items_path=data_items_path,
        pagination_cursor_path=pagination_cursor_path,
        pagination_variable_name=pagination_variable_name,
        variables=variables,
    )
    yield from pages
@@ -0,0 +1,2 @@
1
class ShopifyPartnerApiError(Exception):
    """Error raised for failures reported by the Shopify Partner API."""
@@ -0,0 +1,147 @@
1
+ """Shopify source helpers"""
2
+
3
+ from typing import Any, Iterable, Literal, Optional
4
+ from urllib.parse import urljoin
5
+
6
+ from dlt.common import jsonpath
7
+ from dlt.common.time import ensure_pendulum_datetime
8
+ from dlt.common.typing import Dict, DictStrAny, TDataItems
9
+ from dlt.sources.helpers import requests
10
+
11
+ from .exceptions import ShopifyPartnerApiError
12
+ from .settings import DEFAULT_API_VERSION, DEFAULT_PARTNER_API_VERSION
13
+
14
# Closed set of values accepted for an order status filter.
TOrderStatus = Literal["open", "closed", "cancelled", "any"]
15
+
16
+
17
class ShopifyApi:
    """
    Client that pages through Shopify Admin REST resources.
    """

    def __init__(
        self,
        shop_url: str,
        private_app_password: str,
        api_version: str = DEFAULT_API_VERSION,
    ) -> None:
        """
        Args:
            shop_url: The URL of your shop (e.g. https://my-shop.myshopify.com).
            private_app_password: The private app password to the app on your shop.
            api_version: The API version to use (e.g. 2023-01)
        """
        self.shop_url = shop_url
        self.private_app_password = private_app_password
        self.api_version = api_version

    def get_pages(
        self, resource: str, params: Optional[Dict[str, Any]] = None
    ) -> Iterable[TDataItems]:
        """Request every page of ``resource`` and yield each page's items.

        Args:
            resource: The resource to get pages for (e.g. products, orders, customers).
            params: Query params to include in the first request.

        Yields:
            List of data items from the page
        """
        next_url = urljoin(
            self.shop_url, f"/admin/api/{self.api_version}/{resource}.json"
        )
        auth_headers = {"X-Shopify-Access-Token": self.private_app_password}
        query = params

        while next_url:
            response = requests.get(next_url, params=query, headers=auth_headers)
            response.raise_for_status()
            payload = response.json()

            # The item list is stored under the resource name.
            page = []
            for entry in payload[resource]:
                page.append(self._convert_datetime_fields(entry))
            yield page

            # Follow the "next" link, if any; the loop ends when there is none.
            next_url = response.links.get("next", {}).get("url")
            # Query params are included in subsequent page URLs
            query = None

    def _convert_datetime_fields(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Parse the ``created_at`` / ``updated_at`` values of ``item`` in place.

        Args:
            item: The item to convert

        Returns:
            The same data item (for convenience)
        """
        for name in ("created_at", "updated_at"):
            if name not in item:
                continue
            item[name] = ensure_pendulum_datetime(item[name])
        return item
80
+
81
+
82
class ShopifyPartnerApi:
    """Client for Shopify Partner GraphQL API"""

    def __init__(
        self,
        access_token: str,
        organization_id: str,
        api_version: str = DEFAULT_PARTNER_API_VERSION,
    ) -> None:
        """
        Args:
            access_token: The access token to use
            organization_id: The organization id to query
            api_version: The API version to use (e.g. 2023-01)
        """
        self.access_token = access_token
        self.organization_id = organization_id
        self.api_version = api_version

    @property
    def graphql_url(self) -> str:
        """The GraphQL endpoint URL for this organization and API version."""
        return f"https://partners.shopify.com/{self.organization_id}/api/{self.api_version}/graphql.json"

    def run_graphql_query(
        self, query: str, variables: Optional[DictStrAny] = None
    ) -> DictStrAny:
        """Run a graphql query against the Shopify Partner API

        Args:
            query: The query to run
            variables: The variables to include in the query

        Returns:
            The response JSON

        Raises:
            ShopifyPartnerApiError: If the response JSON has a non-empty
                ``errors`` entry; the raw response text is the message.
        """
        headers = {"X-Shopify-Access-Token": self.access_token}
        response = requests.post(
            self.graphql_url,
            json={"query": query, "variables": variables},
            headers=headers,
        )
        data = response.json()
        if data.get("errors"):
            raise ShopifyPartnerApiError(response.text)
        return data  # type: ignore[no-any-return]

    def get_graphql_pages(
        self,
        query: str,
        data_items_path: jsonpath.TJsonPath,
        pagination_cursor_path: jsonpath.TJsonPath,
        pagination_variable_name: str,
        variables: Optional[DictStrAny] = None,
    ) -> Iterable[TDataItems]:
        """Run ``query`` repeatedly, yielding the data items of each page.

        Args:
            query: The GraphQL query to run.
            data_items_path: JSONPath selecting the data items in a result.
            pagination_cursor_path: JSONPath selecting the cursor(s) in a result;
                the last match is used for the next request.
            pagination_variable_name: Name of the query variable that receives
                the cursor.
            variables: Extra query variables. The mapping is copied, so the
                caller's dict is not modified.

        Yields:
            The data items of each page. Paging stops at the first page with no
            data items or no cursor.
        """
        variables = dict(variables or {})
        while True:
            data = self.run_graphql_query(query, variables)
            data_items = jsonpath.find_values(data_items_path, data)
            if not data_items:
                break
            yield data_items
            cursors = jsonpath.find_values(pagination_cursor_path, data)
            if not cursors:
                break
            variables[pagination_variable_name] = cursors[-1]
@@ -0,0 +1,5 @@
1
# Default lower bound for date-based filters.
FIRST_DAY_OF_MILLENNIUM = "2000-01-01"
# Default API version for the Shopify client.
DEFAULT_API_VERSION = "2023-10"
# Default page size.
DEFAULT_ITEMS_PER_PAGE = 250

# Default API version for the Shopify Partner client.
DEFAULT_PARTNER_API_VERSION = "2024-01"