omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
@@ -0,0 +1,158 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Asana source settings and constants"""
16
+
17
# Default start date for Asana API requests; only tasks started after this date will be collected.
DEFAULT_START_DATE = "2010-01-01T00:00:00.000Z"

# Asana API request timeout.
# NOTE(review): the unit is presumably seconds -- confirm against the code that consumes it.
REQUEST_TIMEOUT = 300

# List of workspace fields to be retrieved from Asana API
WORKSPACE_FIELDS = ("gid", "name", "is_organization", "resource_type", "email_domains")

# List of project fields to be retrieved from Asana API
PROJECT_FIELDS = (
    "name",
    "gid",
    "owner",
    "current_status",
    "custom_fields",
    "default_view",
    "due_date",
    "due_on",
    "is_template",
    "created_at",
    "modified_at",
    "start_on",
    "archived",
    "public",
    "members",
    "followers",
    "color",
    "notes",
    "icon",
    "permalink_url",
    "workspace",
    "team",
    "resource_type",
    "current_status_update",
    "custom_field_settings",
    "completed",
    "completed_at",
    "completed_by",
    "created_from_template",
    "project_brief",
)

# List of section fields to be retrieved from Asana API
SECTION_FIELDS = (
    "gid",
    "resource_type",
    "name",
    "created_at",
    "project",
    "projects",
)

# List of tag fields to be retrieved from Asana API
TAG_FIELDS = (
    "gid",
    "resource_type",
    "created_at",
    "followers",
    "name",
    "color",
    "notes",
    "permalink_url",
    "workspace",
)

# List of task fields to be retrieved from Asana API.
# The dotted entries ("memberships.project.name", "memberships.section.name")
# name nested sub-fields of "memberships".
TASK_FIELDS = (
    "gid",
    "resource_type",
    "name",
    "approval_status",
    "assignee_status",
    "created_at",
    "assignee",
    "start_on",
    "start_at",
    "due_on",
    "due_at",
    "completed",
    "completed_at",
    "completed_by",
    "modified_at",
    "dependencies",
    "dependents",
    "external",
    "notes",
    "num_subtasks",
    "resource_subtype",
    "followers",
    "parent",
    "permalink_url",
    "tags",
    "workspace",
    "custom_fields",
    "project",
    "memberships",
    "memberships.project.name",
    "memberships.section.name",
)

# List of story fields to be retrieved from Asana API
STORY_FIELDS = (
    "gid",
    "resource_type",
    "created_at",
    "created_by",
    "resource_subtype",
    "text",
    "is_pinned",
    "assignee",
    "dependency",
    "follower",
    "new_section",
    "old_section",
    "new_text_value",
    "old_text_value",
    "preview",
    "project",
    "source",
    "story",
    "tag",
    "target",
    "task",
    "sticker_name",
    "custom_field",
    "type",
)

# List of team fields to be retrieved from Asana API.
# NOTE(review): this constant is named TEAMS_FIELD (singular), unlike the other
# *_FIELDS constants in this module; importers must use this exact name.
TEAMS_FIELD = (
    "gid",
    "resource_type",
    "name",
    "description",
    "organization",
    "permalink_url",
    "visibility",
)

# List of user fields to be retrieved from Asana API
USER_FIELDS = ("gid", "resource_type", "name", "email", "photo", "workspaces")
@@ -0,0 +1,102 @@
1
+ from typing import Iterable, Iterator
2
+
3
+ import dlt
4
+ from dlt.sources import DltResource
5
+
6
+ from .helpers import AttioClient
7
+
8
+
9
@dlt.source(max_table_nesting=0)
def attio_source(
    api_key: str,
    params: list[str],
) -> Iterable[DltResource]:
    """Expose the Attio resources, all served by one shared ``AttioClient``.

    Args:
        api_key: Token handed to ``AttioClient``.
        params: Extra table arguments. Each resource validates the number of
            entries it needs when it is iterated, not when the source is built.

    Returns:
        The ``objects``, ``records``, ``lists``, ``list_entries`` and
        ``all_list_entries`` resources, in that order.
    """
    client = AttioClient(api_key)

    def _created_at_hint() -> dict:
        # A fresh dict per resource so no hint object is shared between them.
        return {"created_at": {"data_type": "timestamp", "partition": True}}

    # https://docs.attio.com/rest-api/endpoint-reference/objects/list-objects - does not support pagination
    @dlt.resource(
        name="objects",
        write_disposition="replace",
        columns=_created_at_hint(),
    )
    def fetch_objects() -> Iterator[dict]:
        if len(params) > 0:
            raise ValueError("Objects table must be in the format `objects`")

        yield client.fetch_all("objects", "get")

    # https://docs.attio.com/rest-api/endpoint-reference/records/list-records
    @dlt.resource(
        name="records",
        write_disposition="replace",
        columns=_created_at_hint(),
    )
    def fetch_records() -> Iterator[dict]:
        if len(params) != 1:
            raise ValueError(
                "Records table must be in the format `records:{object_api_slug}`"
            )
        (object_slug,) = params

        yield client.fetch_paginated(f"objects/{object_slug}/records/query", "post")

    # https://docs.attio.com/rest-api/endpoint-reference/lists/list-all-lists -- does not support pagination
    @dlt.resource(
        name="lists",
        write_disposition="replace",
        columns=_created_at_hint(),
    )
    def fetch_lists() -> Iterator[dict]:
        yield client.fetch_all("lists", "get")

    # https://docs.attio.com/rest-api/endpoint-reference/entries/list-entries
    @dlt.resource(
        name="list_entries",
        write_disposition="replace",
        columns=_created_at_hint(),
    )
    def fetch_list_entries() -> Iterator[dict]:
        if len(params) != 1:
            raise ValueError(
                "List entries table must be in the format `list_entries:{list_id}`"
            )
        (list_id,) = params

        yield client.fetch_paginated(f"lists/{list_id}/entries/query", "post")

    @dlt.resource(
        name="all_list_entries",
        write_disposition="replace",
        columns=_created_at_hint(),
    )
    def fetch_all_list_entries() -> Iterator[dict]:
        if len(params) != 1:
            raise ValueError(
                "All list entries table must be in the format `all_list_entries:{object_api_slug}`"
            )
        # Walk every list and pull the entries of those whose parent_object
        # contains the requested slug.
        for attio_list in client.fetch_all("lists", "get"):
            if params[0] not in attio_list["parent_object"]:
                continue
            entries_path = f"lists/{attio_list['id']['list_id']}/entries/query"
            yield from client.fetch_paginated(entries_path, "post")

    return (
        fetch_objects,
        fetch_records,
        fetch_lists,
        fetch_list_entries,
        fetch_all_list_entries,
    )
@@ -0,0 +1,65 @@
1
+ from omniload.src.http_client import create_client
2
+
3
+
4
class AttioClient:
    """Small HTTP client for the Attio v2 REST API.

    Every request is sent with a bearer ``Authorization`` header built from
    the API key, through the session returned by ``create_client()``.
    """

    def __init__(self, api_key: str):
        self.base_url = "https://api.attio.com/v2"
        self.headers = {
            "Accept": "application/json",
            "Authorization": f"Bearer {api_key}",
        }
        self.client = create_client()

    def fetch_paginated(self, path: str, method: str, limit: int = 1000, params=None):
        """Yield every item of a limit/offset paginated endpoint.

        Pages are requested until one comes back with fewer than ``limit``
        items. Each item is passed through ``flatten_item`` before it is
        yielded.

        Args:
            path: Endpoint path relative to ``base_url``.
            method: ``"get"`` sends the arguments as query parameters; any
                other value sends them as a JSON body via POST.
            limit: Page size; must be positive.
            params: Optional extra request arguments. ``limit`` and ``offset``
                keys in it are ignored so the paging loop stays in control.

        Raises:
            ValueError: If ``limit`` is not positive (raised on first
                iteration, since this is a generator).
            Exception: On a non-200 response or a body without ``"data"``.
        """
        # A non-positive limit can never satisfy `len(data) < limit`, so the
        # loop below would never terminate.
        if limit <= 0:
            raise ValueError("limit must be a positive integer")

        url = f"{self.base_url}/{path}"
        extra_args = {} if params is None else params
        offset = 0
        while True:
            # Pagination keys go last so caller-supplied params cannot
            # override them (previously only the POST branch guaranteed this).
            page_args = {**extra_args, "limit": limit, "offset": offset}
            if method == "get":
                response = self.client.get(
                    url, headers=self.headers, params=page_args
                )
            else:
                response = self.client.post(url, headers=self.headers, json=page_args)

            if response.status_code != 200:
                raise Exception(f"HTTP {response.status_code} error: {response.text}")

            response_data = response.json()
            if "data" not in response_data:
                raise Exception(
                    "Attio API returned a response without the expected data"
                )

            data = response_data["data"]
            for item in data:
                yield flatten_item(item)

            # A short page means there is nothing left to fetch.
            if len(data) < limit:
                break

            offset += limit

    def fetch_all(self, path: str, method: str = "get", params=None):
        """Yield every item of a non-paginated endpoint in a single request.

        Args:
            path: Endpoint path relative to ``base_url``.
            method: ``"get"`` sends ``params`` as query parameters; any other
                value sends them as a JSON body via POST.
            params: Optional request arguments.

        Raises:
            Whatever ``response.raise_for_status()`` raises on an HTTP error.
        """
        url = f"{self.base_url}/{path}"
        params = params or {}

        if method == "get":
            response = self.client.get(url, headers=self.headers, params=params)
        else:
            response = self.client.post(url, headers=self.headers, json=params)

        response.raise_for_status()
        # A body without "data" is treated as an empty result here.
        data = response.json().get("data", [])
        for item in data:
            yield flatten_item(item)
59
+
60
+
61
def flatten_item(item: dict) -> dict:
    """Copy the entries of a dict-valued ``item["id"]`` onto ``item`` itself.

    The item is modified in place and also returned. Existing top-level keys
    with the same name are overwritten. Items with no ``"id"`` key, or whose
    ``"id"`` is not a dict (for example a plain string or ``None``), are
    returned unchanged instead of raising.

    Args:
        item: A record as returned by the API.

    Returns:
        The same ``item`` object.
    """
    identifiers = item.get("id")
    if isinstance(identifiers, dict):
        item.update(identifiers)
    return item
omniload/src/blob.py ADDED
@@ -0,0 +1,95 @@
1
+ import warnings
2
+ from typing import Tuple, TypeAlias
3
+ from urllib.parse import ParseResult, urlparse
4
+
5
+ BucketName: TypeAlias = str
6
+ FileGlob: TypeAlias = str
7
+
8
+
9
class UnsupportedEndpointError(Exception):
    """Raised when a file format has no matching reader endpoint."""
11
+
12
+
13
+ def parse_uri(uri: ParseResult, table: str) -> Tuple[BucketName, FileGlob]:
14
+ """
15
+ parse the URI of a blob storage and
16
+ return the bucket name and the file glob.
17
+
18
+ Supports the following Forms:
19
+ - uri: "gs://"
20
+ table: "bucket-name/file-glob"
21
+ - uri: "gs://uri-bucket-name" (uri-bucket-name is preferred)
22
+ table: "gs://table-bucket-name/file-glob"
23
+ - uri: "gs://"
24
+ table: "gs://bucket-name/file-glob"
25
+ - uri: gs://bucket-name/file-glob
26
+ table: None
27
+ - uri: "gs://bucket-name"
28
+ table: "file-glob"
29
+
30
+ The first form is the prefered method. Other forms are supported but discouraged.
31
+ """
32
+
33
+ table = table.strip()
34
+ host = uri.netloc.strip()
35
+
36
+ if table == "" or uri.path.strip() != "":
37
+ warnings.warn(
38
+ f"Using the form '{uri.scheme}://bucket-name/file-glob' is deprecated and will be removed in future versions.",
39
+ DeprecationWarning,
40
+ stacklevel=2,
41
+ )
42
+ return host, uri.path.lstrip("/")
43
+
44
+ table_uri = urlparse(table)
45
+
46
+ if host != "":
47
+ return host, table_uri.path.lstrip("/")
48
+
49
+ if table_uri.hostname:
50
+ return table_uri.hostname, table_uri.path.lstrip("/")
51
+
52
+ parts = table_uri.path.lstrip("/").split("/", maxsplit=1)
53
+ if len(parts) != 2:
54
+ return "", parts[0]
55
+
56
+ return parts[0], parts[1]
57
+
58
+
59
def parse_endpoint(path: str) -> str:
    """
    Parse the endpoint kind from the path's file extension.

    The extension is the text after the last ``.``. A trailing ``gz`` is
    skipped so ``file.csv.gz`` resolves like ``file.csv``. Matching is
    case-sensitive.

    Returns:
        One of ``read_csv``, ``read_jsonl`` or ``read_parquet``.

    Raises:
        UnsupportedEndpointError: If the extension is not csv, jsonl or
            parquet. This includes a path that is just ``gz`` with nothing in
            front of it, which previously escaped as an ``IndexError``.
    """
    readers = {
        "csv": "read_csv",
        "jsonl": "read_jsonl",
        "parquet": "read_parquet",
    }

    segments = path.split(".")
    file_extension = segments[-1]
    # Look through a compression suffix only when there is something before it.
    if file_extension == "gz" and len(segments) > 1:
        file_extension = segments[-2]

    endpoint = readers.get(file_extension)
    if endpoint is None:
        raise UnsupportedEndpointError(f"Unsupported file format: {file_extension}")
    return endpoint
77
+
78
+
79
def determine_endpoint(table: str, path: str) -> str:
    """
    Pick the reader endpoint for a blob source.

    When ``table`` contains ``#``, the text after it names the format
    explicitly (csv, csv_headless, jsonl or parquet) and ``path`` is ignored.
    Otherwise the format is derived from ``path`` via ``parse_endpoint``.

    Raises:
        UnsupportedEndpointError: If the explicit format is not recognised.
        ValueError: If the format cannot be derived from ``path``.
    """
    if "#" not in table:
        try:
            return parse_endpoint(path)
        except Exception as e:
            raise ValueError(f"Failed to parse endpoint from path: {path}") from e

    _, file_format = table.split("#")
    if file_format not in ("csv", "csv_headless", "jsonl", "parquet"):
        raise UnsupportedEndpointError(f"Unsupported file format: {file_format}")
    return f"read_{file_format}"
@@ -0,0 +1,76 @@
1
+ """Bruin source for fetching pipeline and asset data from Bruin Cloud API"""
2
+
3
+ from typing import Iterator
4
+
5
+ import dlt
6
+ from dlt.sources.helpers import requests
7
+
8
+ BASE_URL = "https://cloud.getbruin.com/api/v1"
9
+
10
+
11
def _fetch_pipelines(headers: dict) -> list:
    """GET ``{BASE_URL}/pipelines`` and return the decoded JSON body.

    Raises whatever ``raise_for_status()`` raises on an HTTP error status.
    """
    pipelines_url = f"{BASE_URL}/pipelines"
    resp = requests.get(pipelines_url, headers=headers)
    resp.raise_for_status()
    payload = resp.json()
    return payload
16
+
17
+
18
@dlt.source(name="bruin", max_table_nesting=0)
def bruin_source(api_token: str):
    """
    A dlt source for the Bruin Cloud API.

    Args:
        api_token (str): The API token, sent as a bearer ``Authorization`` header.

    Returns:
        The ``pipelines`` and ``assets`` resources. Each one calls the
        pipelines endpoint itself when it is iterated.
    """
    headers = {"Authorization": f"Bearer {api_token}"}

    # Keys copied from each record, in output order; missing keys become None.
    pipeline_fields = (
        "name",
        "description",
        "project",
        "owner",
        "default_connections",
        "schedule",
        "commit",
        "start_date",
    )
    asset_fields = (
        "name",
        "type",
        "pipeline",
        "project",
        "uri",
        "description",
        "upstreams",
        "downstream",
        "owner",
        "content",
        "columns",
        "materialization",
        "parameters",
    )

    @dlt.resource(write_disposition="replace")
    def pipelines() -> Iterator[dict]:
        """
        Fetches all pipelines and yields one row per pipeline with the
        selected pipeline fields.
        """
        for pipeline in _fetch_pipelines(headers):
            yield {field: pipeline.get(field) for field in pipeline_fields}

    @dlt.resource(write_disposition="replace")
    def assets() -> Iterator[dict]:
        """
        Fetches all assets from all pipelines (same endpoint as pipelines).
        """
        for pipeline in _fetch_pipelines(headers):
            # `.get("assets", [])` would still return None for an explicit
            # null value; `or []` treats that like a pipeline with no assets.
            for asset in pipeline.get("assets") or []:
                yield {field: asset.get(field) for field in asset_fields}

    return pipelines, assets
@@ -0,0 +1,180 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A source loading player profiles and games from chess.com api"""
16
+
17
+ from typing import Any, Callable, Dict, Iterator, List, Sequence
18
+
19
+ import dlt
20
+ from dlt.common import pendulum
21
+ from dlt.common.typing import TDataItem
22
+ from dlt.sources import DltResource
23
+ from dlt.sources.helpers import requests
24
+
25
+ from .helpers import get_path_with_retry, get_url_with_retry, validate_month_string
26
+ from .settings import UNOFFICIAL_CHESS_API_URL
27
+
28
+
29
@dlt.source(name="chess", max_table_nesting=0)
def source(
    players: List[str], start_month: str = None, end_month: str = None
) -> Sequence[DltResource]:
    """
    dlt source bundling the chess.com resources defined in this module.

    Args:
        players (List[str]): Player usernames to load data for.
        start_month (str, optional): Passed to ``players_games``; games in months before it are skipped. Defaults to None.
        end_month (str, optional): Passed to ``players_games``; games in months after it are skipped. Defaults to None.
    Returns:
        Sequence[DltResource]: players_profiles, players_archives, players_games and
        players_online_status, in that order.
    """
    profiles = players_profiles(players)
    archives = players_archives(players)
    games = players_games(players, start_month=start_month, end_month=end_month)
    online_status = players_online_status(players)
    return (profiles, archives, games, online_status)
50
+
51
+
52
@dlt.resource(
    write_disposition="replace",
    columns={
        "last_online": {"data_type": "timestamp"},
        "joined": {"data_type": "timestamp"},
    },
)
def players_profiles(players: List[str]) -> Iterator[TDataItem]:
    """
    Yield one deferred profile lookup per username.

    Args:
        players (List[str]): Usernames whose ``player/{username}`` profile is requested.
    Yields:
        Iterator[TDataItem]: The ``dlt.defer``-wrapped profile fetch for each player.
    """

    # Each profile request is wrapped with `dlt.defer`.
    @dlt.defer
    def _get_profile(username: str) -> TDataItem:
        profile_path = f"player/{username}"
        return get_path_with_retry(profile_path)

    yield from (_get_profile(name) for name in players)
75
+
76
+
77
@dlt.resource(write_disposition="replace", selected=False)
def players_archives(players: List[str]) -> Iterator[List[TDataItem]]:
    """
    Yield, per player, the list stored under ``"archives"`` in the response
    of ``player/{username}/games/archives``.

    Args:
        players (List[str]): Usernames to look up.
    Yields:
        Iterator[List[TDataItem]]: One list per player; empty when the response has no ``"archives"`` key.
    """
    for name in players:
        archive_index = get_path_with_retry(f"player/{name}/games/archives")
        yield archive_index.get("archives", [])
89
+
90
+
91
@dlt.resource(
    write_disposition="replace", columns={"end_time": {"data_type": "timestamp"}}
)
def players_games(
    players: List[str], start_month: str = None, end_month: str = None
) -> Iterator[Callable[[], List[TDataItem]]]:
    """
    Yield deferred downloads of the monthly game archives of ``players``.

    Archives outside the ``start_month``..``end_month`` window, and archives
    already recorded in the resource state, are skipped.

    Args:
        players (List[str]): Usernames to retrieve games for.
        start_month (str, optional): First month to include, "YYYY/MM". Defaults to None.
        end_month (str, optional): Last month to include, "YYYY/MM". Defaults to None.
    Yields:
        Iterator[Callable[[], List[TDataItem]]]: One deferred archive download per selected archive url.
    """
    # Catch badly formatted month strings before doing any work.
    for month_bound in (start_month, end_month):
        validate_month_string(month_bound)

    # Archive urls handled so far are kept in the resource state under "archives".
    seen_archives = dlt.current.resource_state().setdefault("archives", [])
    # The archives resource is called like a function and iterated directly.
    archive_urls = players_archives(players)

    # Each archive request is wrapped with `dlt.defer`.
    @dlt.defer
    def _get_archive(url: str) -> List[TDataItem]:
        try:
            return get_url_with_retry(url).get("games", [])  # type: ignore
        except requests.HTTPError as http_err:
            # A 404 on an archive is treated as "no games"; anything else propagates.
            if http_err.response.status_code != 404:
                raise
            return []

    for url in archive_urls:
        # The url ends in {YYYY}/{MM}, so its last 7 characters compare
        # directly against the month bounds.
        archive_month = url[-7:]
        if (start_month and archive_month < start_month) or (
            end_month and archive_month > end_month
        ):
            continue
        # Never download the same archive twice.
        if url in seen_archives:
            continue
        seen_archives.append(url)
        yield _get_archive(url)
140
+
141
+
142
@dlt.resource(write_disposition="append")
def players_online_status(players: List[str]) -> Iterator[TDataItem]:
    """
    Yield the current online status of each player.

    Args:
        players (List[str]): Usernames to check.
    Yields:
        Iterator[TDataItem]: A dict with ``username``, ``onlineStatus``, ``lastLoginDate`` and ``check_time``.
    """
    # The status comes from the unofficial API's user popup endpoint.
    for player in players:
        popup = get_url_with_retry(f"{UNOFFICIAL_CHESS_API_URL}user/popup/{player}")
        # Keep only the relevant fields and stamp the row with the check time.
        row = {
            "username": player,
            "onlineStatus": popup["onlineStatus"],
            "lastLoginDate": popup["lastLoginDate"],
            "check_time": pendulum.now(),
        }
        yield row
161
+
162
+
163
@dlt.source
def chess_dlt_config_example(
    secret_str: str = dlt.secrets.value,
    secret_dict: Dict[str, Any] = dlt.secrets.value,
    config_int: int = dlt.config.value,
) -> DltResource:
    """
    Example source whose arguments default to dlt secret and config values.

    Args:
        secret_str (str, optional): Defaults to dlt.secrets.value.
        secret_dict (Dict[str, Any], optional): Defaults to dlt.secrets.value.
        config_int (int, optional): Defaults to dlt.config.value.
    Returns:
        DltResource: A resource named ``config_values`` holding the three values.
    """
    # Test helper only: hand the resolved values back as a resource.
    resolved_values = [secret_str, secret_dict, config_int]
    return dlt.resource(resolved_values, name="config_values")