omniload 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/conftest.py +72 -0
- omniload/main.py +810 -0
- omniload/src/.gitignore +10 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_sheets/README.md +95 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/loader.py +69 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/testdata/fakebqcredentials.json +14 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/version.py +6 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload/testdata/.gitignore +2 -0
- omniload/testdata/create_replace.csv +21 -0
- omniload/testdata/delete_insert_expected.csv +6 -0
- omniload/testdata/delete_insert_part1.csv +5 -0
- omniload/testdata/delete_insert_part2.csv +6 -0
- omniload/testdata/merge_expected.csv +5 -0
- omniload/testdata/merge_part1.csv +4 -0
- omniload/testdata/merge_part2.csv +5 -0
- omniload/tests/unit/test_smartsheets.py +133 -0
- omniload-0.0.0.dev0.dist-info/METADATA +439 -0
- omniload-0.0.0.dev0.dist-info/RECORD +218 -0
- omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
- omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
This module defines default settings for the Freshdesk integration.
|
|
17
|
+
|
|
18
|
+
It specifies a list of default endpoints to be used when interacting with the Freshdesk API,
|
|
19
|
+
covering common entities such as agents, companies, contacts, groups, roles, and tickets.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
# Define default endpoints for the Freshdesk API integration.
|
|
23
|
+
DEFAULT_ENDPOINTS = [
    "agents",
    "companies",
    "contacts",
    "groups",
    "roles",
    "tickets",
]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Fundraiseup source for ingesting donations, events, fundraisers, recurring plans, and supporters."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Generator, Iterable, TypedDict
|
|
4
|
+
|
|
5
|
+
import dlt
|
|
6
|
+
import pendulum
|
|
7
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
8
|
+
from dlt.sources import DltResource
|
|
9
|
+
|
|
10
|
+
from .client import FundraiseupClient
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DonationCursor(TypedDict):
    """Incremental cursor for donations: a record's id plus its creation time.

    Instances are built by ``order_by_created`` from a donation record and are
    compared by their ``created_at`` value.
    """

    # Value of the record's "id" field.
    id: str
    # Value of the record's "created_at" field, converted with
    # ensure_pendulum_datetime.
    created_at: pendulum.DateTime
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def order_by_created(record) -> DonationCursor:
    """Return the donation cursor with the most recent ``created_at``.

    This function is passed as ``last_value_func`` to the incremental
    donations resource.

    Args:
        record: Either a one-element sequence ``(row,)`` or a two-element
            sequence ``(row, last_value)``. ``row`` must provide ``"id"`` and
            ``"created_at"``; ``last_value`` is a previously returned cursor.

    Returns:
        The cursor built from ``row`` when no previous cursor is supplied;
        otherwise whichever of the two cursors has the greater ``created_at``
        (the freshly built cursor wins ties).
    """
    previous = None
    if len(record) == 1:
        (row,) = record
    else:
        row, previous = record

    candidate = DonationCursor(
        id=row["id"],
        created_at=ensure_pendulum_datetime(row["created_at"]),
    )

    if previous is None:
        return candidate

    # Only a strictly newer previous cursor displaces the new one, which is
    # the same tie-breaking max(candidate, previous, key=...) would apply.
    if previous["created_at"] > candidate["created_at"]:
        return previous
    return candidate
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dlt.source(name="fundraiseup", max_table_nesting=0)
def fundraiseup_source(api_key: str) -> Iterable[DltResource]:
    """
    Build the DLT resources exposed by the Fundraiseup source.

    Args:
        api_key: API key handed to the Fundraiseup client.

    Returns:
        A list whose first element is the incremental ``donations:incremental``
        resource (merge on ``id``), followed by one full-replace resource for
        each of donations, events, fundraisers, recurring plans and supporters.
    """
    client = FundraiseupClient(api_key=api_key)

    # Every plain endpoint is loaded the same way: full replace, keyed on "id".
    endpoint_names = (
        "donations",
        "events",
        "fundraisers",
        "recurring_plans",
        "supporters",
    )
    resources = {
        endpoint: {"write_disposition": "replace", "primary_key": "id"}
        for endpoint in endpoint_names
    }

    def create_resource(resource_name: str, config: Dict[str, Any]) -> DltResource:
        """Create the resource that pages through ``resource_name``."""

        @dlt.resource(
            name=resource_name,
            write_disposition=config["write_disposition"],
            primary_key=config["primary_key"],
        )
        def generic_resource() -> Generator[Dict[str, Any], None, None]:
            """Yield each batch returned by the client unchanged."""
            yield from client.get_paginated_data(resource_name)  # type: ignore[misc]

        return generic_resource()

    @dlt.resource(
        name="donations:incremental",
        write_disposition="merge",
        primary_key="id",
    )
    def donations_incremental(
        last_record: dlt.sources.incremental[DonationCursor] = dlt.sources.incremental(
            "$",
            range_start="closed",
            range_end="closed",
            last_value_func=order_by_created,
        ),
    ):
        """Yield donation batches, resuming after the last stored cursor id."""
        previous = last_record.last_value
        params = {} if previous is None else {"starting_after": previous["id"]}
        yield from client.get_paginated_data("donations", params=params)  # type: ignore[misc]

    # Incremental donations first, then the full-replace resources in the
    # order they were declared above.
    return [
        donations_incremental,
        *(create_resource(name, config) for name, config in resources.items()),
    ]
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Fundraiseup API Client for handling authentication and paginated requests."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Iterator, Optional
|
|
4
|
+
|
|
5
|
+
from omniload.src.http_client import create_client
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FundraiseupClient:
    """Client for interacting with Fundraiseup API v1."""

    def __init__(self, api_key: str):
        """
        Initialize Fundraiseup API client.

        Args:
            api_key: API key for authentication
        """
        self.api_key = api_key
        self.base_url = "https://api.fundraiseup.com/v1"
        # Shared HTTP client; these status codes are handed to it as the
        # ones to retry.
        self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])

    def get_paginated_data(
        self,
        endpoint: str,
        params: Optional[Dict[str, Any]] = None,
        page_size: int = 100,
    ) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch paginated data from a Fundraiseup API endpoint using cursor-based pagination.

        The caller's ``params`` mapping is copied and never modified; ``limit``
        and the ``starting_after`` cursor are added to the copy only.

        Args:
            endpoint: API endpoint path (e.g., "donations")
            params: Additional query parameters. A ``starting_after`` entry,
                if present, is sent with the first request.
            page_size: Number of items per page (default 100)

        Yields:
            Non-empty batches of items from the API

        Raises:
            Whatever ``response.raise_for_status()`` raises for a failed request.
        """
        url = f"{self.base_url}/{endpoint}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Work on a private copy so pagination state never leaks into the
        # dictionary owned by the caller.
        query: Dict[str, Any] = dict(params) if params else {}
        query["limit"] = page_size

        while True:
            response = self.client.get(url=url, headers=headers, params=query)
            response.raise_for_status()

            data = response.json()

            # Handle both list response and object with data array
            if isinstance(data, list):
                items = data
                # A bare list carries no "has_more" flag; a full page is taken
                # as the sign that another page may follow.
                has_more = len(items) == page_size
            else:
                items = data.get("data", [])
                has_more = data.get("has_more", False)

            if not items:
                break

            yield items

            if not has_more:
                break

            # The id of the last item is the cursor for the next page; without
            # one there is no way to advance, so stop.
            next_cursor = items[-1].get("id")
            if not next_cursor:
                break
            query["starting_after"] = next_cursor
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Source that load github issues, pull requests and reactions for a specific repository via customizable graphql query. Loads events incrementally."""
|
|
16
|
+
|
|
17
|
+
import urllib.parse
|
|
18
|
+
from typing import Iterator, Optional, Sequence
|
|
19
|
+
|
|
20
|
+
import dlt
|
|
21
|
+
import pendulum
|
|
22
|
+
from dlt.common.typing import TDataItems
|
|
23
|
+
from dlt.sources import DltResource
|
|
24
|
+
|
|
25
|
+
from .helpers import get_reactions_data, get_rest_pages, get_stargazers
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dlt.source(max_table_nesting=0)
def github_reactions(
    owner: str,
    name: str,
    access_token: str,
    items_per_page: int = 100,
    max_items: Optional[int] = None,
) -> Sequence[DltResource]:
    """Get reactions associated with issues, pull requests and comments in the repo `name` with owner `owner`.

    Both resources are produced by `get_reactions_data`, which queries the GraphQL API for issues
    (`issues` resource) and pull requests (`pull_requests` resource) together with up to 100 reactions
    and up to 100 comments per item. Each resource replaces its table on every load.
    The queries live in `queries.py` and can be extended with additional fields or connections.

    Args:
        owner (str): The repository owner
        name (str): The repository name
        access_token (str): The access token sent with every GraphQL request.
        items_per_page (int, optional): How many issues/pull requests to get in single page. Defaults to 100.
        max_items (int, optional): How many issues/pull requests to get in total. None means All.

    Returns:
        Sequence[DltResource]: Two DltResources: `issues` with issues and `pull_requests` with pull requests
    """

    def _replace_resource(node_type: str, resource_name: str) -> DltResource:
        # Both resources differ only in the GraphQL connection they read and
        # in the table name they are loaded under.
        return dlt.resource(
            get_reactions_data(
                node_type,
                owner,
                name,
                access_token,
                items_per_page,
                max_items,
            ),
            name=resource_name,
            write_disposition="replace",
        )

    return (
        _replace_resource("issues", "issues"),
        _replace_resource("pullRequests", "pull_requests"),
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dlt.source(max_table_nesting=0)
def github_repo_events(
    owner: str,
    name: str,
    access_token: str,
    start_date: pendulum.DateTime,
    end_date: Optional[pendulum.DateTime] = None,
) -> DltResource:
    """Gets events for repository `name` with owner `owner` incrementally.

    This source contains a single resource `repo_events` that gets given repository's events and dispatches them to separate tables with names based on event type.
    The data is loaded incrementally on the `created_at` field and merged into the tables using `id` as the primary key.
    Please note that Github allows only for 300 events to be retrieved for public repositories. You should get the events frequently for the active repos.

    Args:
        owner (str): The repository owner
        name (str): The repository name
        access_token (str): The classic or fine-grained access token. If not provided, calls are made anonymously
        start_date (pendulum.DateTime): Initial value of the `created_at` incremental cursor (passed as an ISO-8601 string).
        end_date (pendulum.DateTime, optional): End value of the incremental cursor. When omitted, the current time is used as the upper bound of the filter.

    Returns:
        DltSource: source with the `repo_events` resource

    """

    # use naming function in table name to generate separate tables for each event
    @dlt.resource(
        primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
    )
    def repo_events(
        last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
            "created_at",
            initial_value=start_date.isoformat(),
            end_value=end_date.isoformat() if end_date else None,
            last_value_func=max,
            range_end="closed",
            range_start="closed",
        ),
    ) -> Iterator[TDataItems]:
        # Owner and repository name are URL-quoted before being placed in the path.
        repos_path = (
            f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
        )

        # Get the date range from the incremental state
        start_filter = pendulum.parse(
            last_created_at.last_value or last_created_at.initial_value
        )
        # end_filter is always a datetime here (it falls back to "now"), so the
        # `end_filter is None` test in the loop below is purely defensive.
        end_filter = (
            pendulum.parse(last_created_at.end_value)
            if last_created_at.end_value
            else pendulum.now()
        )

        for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
            # Filter events by date range
            filtered_events = []
            for event in page:
                event_date = pendulum.parse(event["created_at"])

                # Check if event is within the date range
                if event_date >= start_filter:
                    if end_filter is None or event_date <= end_filter:
                        filtered_events.append(event)
                    elif event_date > end_filter:
                        # Skip events that are newer than our end date
                        continue
                else:
                    # Events are ordered by date desc, so if we hit an older event, we can stop
                    # (this leaves only the per-page loop; paging itself is ended below)
                    break

            # Pages with no event inside the range are not yielded at all.
            if filtered_events:
                yield filtered_events

            # stop requesting pages if the last element was already older than initial value
            # note: incremental will skip those items anyway, we just do not want to use the api limits
            if last_created_at.start_out_of_range:
                print(
                    f"Overlap with previous run created at {last_created_at.initial_value}"
                )
                break

    return repo_events
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@dlt.source(max_table_nesting=0)
def github_stargazers(
    owner: str,
    name: str,
    access_token: str,
    items_per_page: int = 100,
    max_items: Optional[int] = None,
) -> Sequence[DltResource]:
    """Get stargazers in the repo `name` with owner `owner`.

    The single `stargazers` resource is fed by `get_stargazers`, which pages through
    the repository's stargazers and their starred dates. The table is replaced on every load.

    Args:
        owner (str): The repository owner
        name (str): The repository name
        access_token (str): The access token sent with every GraphQL request.
        items_per_page (int, optional): How many stargazers to get in single page. Defaults to 100.
        max_items (int, optional): How many stargazers to get in total. None means All.

    Returns:
        Sequence[DltResource]: One DltResource: `stargazers`
    """
    stargazer_pages = get_stargazers(
        owner,
        name,
        access_token,
        items_per_page,
        max_items,
    )
    stargazers = dlt.resource(
        stargazer_pages,
        name="stargazers",
        write_disposition="replace",
    )
    # Callers receive a one-element tuple, not the bare resource.
    return (stargazers,)
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Iterator, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
from dlt.common.typing import DictStrAny, StrAny
|
|
18
|
+
from dlt.common.utils import chunks
|
|
19
|
+
from dlt.sources.helpers import requests
|
|
20
|
+
|
|
21
|
+
from .queries import COMMENT_REACTIONS_QUERY, ISSUES_QUERY, RATE_LIMIT, STARGAZERS_QUERY
|
|
22
|
+
from .settings import GRAPHQL_API_BASE_URL, REST_API_BASE_URL
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
#
|
|
26
|
+
# Shared
|
|
27
|
+
#
|
|
28
|
+
def _get_auth_header(access_token: Optional[str]) -> StrAny:
|
|
29
|
+
if access_token:
|
|
30
|
+
return {"Authorization": f"Bearer {access_token}"}
|
|
31
|
+
else:
|
|
32
|
+
# REST API works without access token (with high rate limits)
|
|
33
|
+
return {}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
#
|
|
37
|
+
# Rest API helpers
|
|
38
|
+
#
|
|
39
|
+
def get_rest_pages(access_token: Optional[str], query: str) -> Iterator[List[StrAny]]:
    """Yield successive pages of a GitHub REST API listing.

    The first request goes to ``REST_API_BASE_URL + query``; every following
    request uses the ``next`` link advertised by the previous response.

    Args:
        access_token: Token for the ``Authorization`` header, or ``None`` for
            unauthenticated calls.
        query: Path and query string appended to the REST base URL.

    Yields:
        The decoded JSON body of each non-empty page. Iteration ends at the
        first empty page or at the first response without a ``next`` link.
    """
    # The header does not change between pages, so build it once.
    auth_header = _get_auth_header(access_token)

    def _request(page_url: str) -> requests.Response:
        r = requests.get(page_url, headers=auth_header)
        # The remaining-quota header is only used for this progress message;
        # a response that lacks it must not abort paging with a KeyError.
        remaining = r.headers.get("x-ratelimit-remaining", "unknown")
        print(f"got page {page_url}, requests left: " + remaining)
        return r

    next_page_url = REST_API_BASE_URL + query
    while True:
        r: requests.Response = _request(next_page_url)
        page_items = r.json()
        if not page_items:
            break
        yield page_items
        if "next" not in r.links:
            break
        next_page_url = r.links["next"]["url"]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
#
|
|
60
|
+
# GraphQL API helpers
|
|
61
|
+
#
|
|
62
|
+
def get_stargazers(
    owner: str,
    name: str,
    access_token: str,
    items_per_page: int,
    max_items: Optional[int],
) -> Iterator[Iterator[StrAny]]:
    """Page through the stargazers of repository ``owner``/``name``.

    Args:
        owner: The repository owner.
        name: The repository name.
        access_token: Token used for the GraphQL requests.
        items_per_page: Page size passed to the query as ``items_per_page``.
        max_items: Total item limit forwarded to the paging helper; ``None``
            means no limit.

    Yields:
        One lazy iterator per page; each element is a dict with the keys
        ``starredAt`` and ``user`` (the latter taken from the item's ``node``).
    """

    def _to_row(edge: StrAny) -> StrAny:
        return {"starredAt": edge["starredAt"], "user": edge["node"]}

    query_variables = {"owner": owner, "name": name, "items_per_page": items_per_page}
    pages = _get_graphql_pages(
        access_token, STARGAZERS_QUERY, query_variables, "stargazers", max_items
    )
    for edges in pages:
        yield map(_to_row, edges)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_reactions_data(
    node_type: str,
    owner: str,
    name: str,
    access_token: str,
    items_per_page: int,
    max_items: Optional[int],
) -> Iterator[Iterator[StrAny]]:
    """Page through ``node_type`` items of a repository together with their reactions.

    Args:
        node_type: Name of the repository connection to read; it is
            interpolated into ``ISSUES_QUERY`` and used to locate the result.
        owner: The repository owner.
        name: The repository name.
        access_token: Token used for the GraphQL requests.
        items_per_page: Page size passed to the query as ``issues_per_page``.
        max_items: Total item limit forwarded to the paging helper; ``None``
            means no limit.

    Yields:
        One lazy iterator per page that applies ``_extract_nested_nodes`` to
        every item of that page.
    """
    variables = {
        "owner": owner,
        "name": name,
        "issues_per_page": items_per_page,
        "first_reactions": 100,
        "first_comments": 100,
        "node_type": node_type,
    }
    query = ISSUES_QUERY % node_type
    for page_items in _get_graphql_pages(
        access_token, query, variables, node_type, max_items
    ):
        # use reactionGroups to query for reactions to comments that have any reactions. reduces cost by 10-50x
        comments_with_reactions = {}
        for item in page_items:
            for comment in item["comments"]["nodes"]:
                has_reaction = any(
                    group["createdAt"] for group in comment["reactionGroups"]
                )
                if has_reaction:
                    comments_with_reactions[comment["id"]] = comment
                # reactionGroups was only needed for the check above.
                comment.pop("reactionGroups", None)

        # get comment reactions by querying comment nodes separately
        fetched = _get_comment_reaction(list(comments_with_reactions), access_token)
        # attach the reaction nodes where they should be
        for reacted in fetched.values():
            comments_with_reactions[reacted["id"]]["reactions"] = reacted["reactions"]
        yield map(_extract_nested_nodes, page_items)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _extract_top_connection(data: StrAny, node_type: str) -> StrAny:
|
|
120
|
+
assert isinstance(data, dict) and len(data) == 1, (
|
|
121
|
+
f"The data with list of {node_type} must be a dictionary and contain only one element"
|
|
122
|
+
)
|
|
123
|
+
data = next(iter(data.values()))
|
|
124
|
+
return data[node_type] # type: ignore
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _extract_nested_nodes(item: DictStrAny) -> DictStrAny:
|
|
128
|
+
"""Recursively moves `nodes` and `totalCount` to reduce nesting."""
|
|
129
|
+
item["reactions_totalCount"] = item["reactions"].get("totalCount", 0)
|
|
130
|
+
item["reactions"] = item["reactions"]["nodes"]
|
|
131
|
+
comments = item["comments"]
|
|
132
|
+
item["comments_totalCount"] = item["comments"].get("totalCount", 0)
|
|
133
|
+
for comment in comments["nodes"]:
|
|
134
|
+
if "reactions" in comment:
|
|
135
|
+
comment["reactions_totalCount"] = comment["reactions"].get("totalCount", 0)
|
|
136
|
+
comment["reactions"] = comment["reactions"]["nodes"]
|
|
137
|
+
item["comments"] = comments["nodes"]
|
|
138
|
+
return item
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _run_graphql_query(
    access_token: str, query: str, variables: DictStrAny
) -> Tuple[StrAny, StrAny]:
    """POST a GraphQL query and split the result from its rate-limit info.

    Args:
        access_token: Token used to build the authorization header.
        query: GraphQL query text.
        variables: Variables sent alongside the query.

    Returns:
        A ``(data, rate_limit)`` pair: the response's ``data`` object with the
        ``rateLimit`` entry removed, and that entry itself (or
        ``{"cost": 0, "remaining": 0}`` when the response has none).

    Raises:
        ValueError: If the response body contains an ``errors`` key; the whole
            decoded body is passed as the exception argument.
    """
    response = requests.post(
        GRAPHQL_API_BASE_URL,
        json={"query": query, "variables": variables},
        headers=_get_auth_header(access_token),
    )
    payload = response.json()
    if "errors" in payload:
        raise ValueError(payload)

    result = payload["data"]
    # pop rate limits
    rate_limit = result.pop("rateLimit", {"cost": 0, "remaining": 0})
    return result, rate_limit
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _get_graphql_pages(
    access_token: str,
    query: str,
    variables: DictStrAny,
    node_type: str,
    max_items: Optional[int],
) -> Iterator[List[DictStrAny]]:
    """Run ``query`` repeatedly and yield the items of each result page.

    After every page the ``endCursor`` of the connection is written into
    ``variables["page_after"]``, so the ``variables`` mapping passed in is
    modified while iterating.

    Args:
        access_token: Token used for the GraphQL requests.
        query: GraphQL query text.
        variables: Query variables; updated in place with ``page_after``.
        node_type: Key of the paged connection in the response.
        max_items: Stop once at least this many items were yielded in total.
            ``None`` (or 0) disables the limit.

    Yields:
        The ``nodes`` list of each page, or its ``edges`` list when the
        connection has no ``nodes`` key. Iteration ends at the first empty page.
    """
    items_count = 0
    while True:
        data, rate_limit = _run_graphql_query(access_token, query, variables)
        top_connection = _extract_top_connection(data, node_type)
        data_items = (
            top_connection["nodes"]
            if "nodes" in top_connection
            else top_connection["edges"]
        )
        items_count += len(data_items)
        print(
            f"Got {len(data_items)}/{items_count} {node_type}s, query cost {rate_limit['cost']}, remaining credits: {rate_limit['remaining']}"
        )
        if not data_items:
            return
        yield data_items

        # Reuse the connection extracted above instead of extracting it from
        # the same response a second time.
        variables["page_after"] = top_connection["pageInfo"]["endCursor"]
        if max_items and items_count >= max_items:
            print(f"Max items limit reached: {items_count} >= {max_items}")
            return
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _get_comment_reaction(comment_ids: List[str], access_token: str) -> StrAny:
    """Fetch reactions for the given comments, 50 comments per query.

    Each comment id is combined with a running index (unique across all
    chunks) into a ``COMMENT_REACTIONS_QUERY`` fragment; the fragments of a
    chunk plus ``RATE_LIMIT`` form one GraphQL query.

    Args:
        comment_ids: Ids of the comments to look up.
        access_token: Token used for the GraphQL requests.

    Returns:
        A dict that merges the ``data`` payloads of all chunk queries.
    """
    merged: DictStrAny = {}
    offset = 0
    for batch in chunks(comment_ids, 50):
        fragments = [
            COMMENT_REACTIONS_QUERY % (offset + position, comment_id)
            for position, comment_id in enumerate(batch)
        ]
        # Keep the index running across chunks so it stays unique per comment.
        offset += len(fragments)
        fragments.append(RATE_LIMIT)
        query = "{" + ",\n".join(fragments) + "}"
        page, rate_limit = _run_graphql_query(access_token, query, {})
        print(
            f"Got {len(page)} comments, query cost {rate_limit['cost']}, remaining credits: {rate_limit['remaining']}"
        )
        merged.update(page)
    return merged
|