ingestr 0.12.3__py3-none-any.whl → 0.12.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +53 -5
- ingestr/src/arrow/__init__.py +0 -4
- ingestr/src/factory.py +4 -0
- ingestr/src/github/__init__.py +149 -0
- ingestr/src/github/helpers.py +193 -0
- ingestr/src/github/queries.py +115 -0
- ingestr/src/github/settings.py +10 -0
- ingestr/src/google_analytics/__init__.py +70 -0
- ingestr/src/google_analytics/helpers/__init__.py +70 -0
- ingestr/src/google_analytics/helpers/data_processing.py +176 -0
- ingestr/src/sources.py +123 -47
- ingestr/src/sql_database/__init__.py +0 -0
- ingestr/src/sql_database/callbacks.py +66 -0
- ingestr/src/version.py +1 -1
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/METADATA +4 -3
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/RECORD +19 -10
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/WHEEL +0 -0
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/entry_points.txt +0 -0
- {ingestr-0.12.3.dist-info → ingestr-0.12.5.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
@@ -32,7 +32,7 @@ DATE_FORMATS = [

 # https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
 PARQUET_SUPPORTED_DESTINATIONS = [
-    "bigquery",
+    "athena" "bigquery",
     "duckdb",
     "snowflake",
     "databricks",
@@ -57,8 +57,9 @@ class SpinnerCollector(Collector):
         name: str,
         inc: int = 1,
         total: Optional[int] = None,
-        message: Optional[str] = None,
+        message: Optional[str] = None,  # type: ignore
         label: str = "",
+        **kwargs,
     ) -> None:
         self.status.update(self.current_step)

@@ -287,8 +288,14 @@ def ingest(
             envvar="SQL_EXCLUDE_COLUMNS",
         ),
     ] = [],  # type: ignore
+    columns: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="The column types to be used for the destination table in the format of 'column_name:column_type'",
+            envvar="COLUMNS",
+        ),
+    ] = None,  # type: ignore
 ):
-    # TODO(turtledev): can't we move this to the top of this file?
     import hashlib
     import tempfile
     from datetime import datetime
@@ -296,6 +303,7 @@ def ingest(
     import dlt
     import humanize
     import typer
+    from dlt.common.data_types import TDataType
     from dlt.common.destination import Destination
     from dlt.common.pipeline import LoadInfo
     from dlt.common.runtime.collector import Collector, LogCollector
@@ -345,7 +353,7 @@ def ingest(
         not in dlt_dest.capabilities().supported_loader_file_formats
     ):
         print(
-            f"[red]Loader file format {loader_file_format.value} is not supported by the destination.[/red]"
+            f"[red]Loader file format {loader_file_format.value} is not supported by the destination, available formats: {dlt_dest.capabilities().supported_loader_file_formats}.[/red]"
         )
         raise typer.Abort()

@@ -357,6 +365,23 @@ def ingest(
     else:
         executable(source)

+    def parse_columns(columns: list[str]) -> dict[str, TDataType]:
+        from typing import cast, get_args
+
+        possible_types = get_args(TDataType)
+
+        types: dict[str, TDataType] = {}
+        for column in columns:
+            for candidate in column.split(","):
+                column_name, column_type = candidate.split(":")
+                if column_type not in possible_types:
+                    print(
+                        f"[red]Column type '{column_type}' is not supported, supported types: {possible_types}.[/red]"
+                    )
+                    raise typer.Abort()
+                types[column_name] = cast(TDataType, column_type)
+        return types
+
     track(
         "command_triggered",
         {
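For reference, a minimal sketch of what this parsing does for a flag value such as 'created_at:date,visits:bigint'. The column names are made up; the accepted types are assumed to be whatever dlt's TDataType literal allows (e.g. "text", "bigint", "date"):

    from typing import get_args
    from dlt.common.data_types import TDataType

    value = "created_at:date,visits:bigint"  # hypothetical --columns value
    hints = {}
    for candidate in value.split(","):
        column_name, column_type = candidate.split(":")
        # reject anything outside dlt's TDataType literal, as parse_columns does
        assert column_type in get_args(TDataType)
        hints[column_name] = column_type
    print(hints)  # {'created_at': 'date', 'visits': 'bigint'}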
@@ -399,12 +424,20 @@ def ingest(
     column_hints: dict[str, TColumnSchema] = {}
     original_incremental_strategy = incremental_strategy

+    if columns:
+        column_types = parse_columns(columns)
+        for column_name, column_type in column_types.items():
+            column_hints[column_name] = {"data_type": column_type}
+
     merge_key = None
     if incremental_strategy == IncrementalStrategy.delete_insert:
         merge_key = incremental_key
         incremental_strategy = IncrementalStrategy.merge
     if incremental_key:
-
+        if incremental_key not in column_hints:
+            column_hints[incremental_key] = {}
+
+        column_hints[incremental_key]["merge_key"] = True

     m = hashlib.sha256()
     m.update(dest_table.encode("utf-8"))
@@ -491,6 +524,21 @@ def ingest(
     if factory.source_scheme == "sqlite":
         source_table = "main." + source_table.split(".")[-1]

+    if (
+        incremental_key
+        and incremental_key in column_hints
+        and "data_type" in column_hints[incremental_key]
+        and column_hints[incremental_key]["data_type"] == "date"
+    ):
+        # By default, ingestr treats the start and end dates as datetime objects. While this worked fine for many cases, if the
+        # incremental field is a date, the start and end dates cannot be compared to the incremental field, and the ingestion would fail.
+        # In order to eliminate this, we have introduced a new option to ingestr, --columns, which allows the user to specify the column types for the destination table.
+        # This way, ingestr will know the data type of the incremental field, and will be able to convert the start and end dates to the correct data type before running the ingestion.
+        if interval_start:
+            interval_start = interval_start.date()  # type: ignore
+        if interval_end:
+            interval_end = interval_end.date()  # type: ignore
+
     dlt_source = source.dlt_source(
         uri=source_uri,
         table=source_table,
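The comparison failure these four lines guard against is plain Python behavior: a datetime and a date cannot be ordered against each other. A small self-contained illustration (values are made up):

    from datetime import date, datetime

    interval_start = datetime(2024, 1, 1)  # ingestr's default bound type
    row_value = date(2024, 1, 2)           # incremental field declared as "date"
    try:
        row_value > interval_start         # raises TypeError
    except TypeError:
        interval_start = interval_start.date()
    assert row_value > interval_start      # comparable after truncation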
ingestr/src/arrow/__init__.py
CHANGED
ingestr/src/factory.py
CHANGED
@@ -24,6 +24,8 @@ from ingestr.src.sources import (
     ChessSource,
     DynamoDBSource,
     FacebookAdsSource,
+    GitHubSource,
+    GoogleAnalyticsSource,
     GoogleSheetsSource,
     GorgiasSource,
     HubspotSource,
@@ -102,6 +104,7 @@ class SourceDestinationFactory:
         "gsheets": GoogleSheetsSource,
         "shopify": ShopifySource,
         "gorgias": GorgiasSource,
+        "github": GitHubSource,
         "chess": ChessSource,
         "stripe": StripeAnalyticsSource,
         "facebookads": FacebookAdsSource,
@@ -118,6 +121,7 @@ class SourceDestinationFactory:
         "dynamodb": DynamoDBSource,
         "asana": AsanaSource,
         "tiktok": TikTokSource,
+        "googleanalytics": GoogleAnalyticsSource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
ingestr/src/github/__init__.py
ADDED
@@ -0,0 +1,149 @@
+"""Source that load github issues, pull requests and reactions for a specific repository via customizable graphql query. Loads events incrementally."""
+
+import urllib.parse
+from typing import Iterator, Optional, Sequence
+
+import dlt
+from dlt.common.typing import TDataItems
+from dlt.sources import DltResource
+
+from .helpers import get_reactions_data, get_rest_pages, get_stargazers
+
+
+@dlt.source
+def github_reactions(
+    owner: str,
+    name: str,
+    access_token: str = dlt.secrets.value,
+    items_per_page: int = 100,
+    max_items: Optional[int] = None,
+) -> Sequence[DltResource]:
+    """Get reactions associated with issues, pull requests and comments in the repo `name` with owner `owner`.
+
+    This source uses graphql to retrieve all issues (`issues` resource) and pull requests (`pull requests` resource) with the associated reactions (up to 100),
+    comments (up to 100) and reactions to comments (also up to 100). Internally graphql is used to retrieve data. It is cost optimized and you are able to retrieve the
+    data for fairly large repos quickly and cheaply.
+    You can and should change the queries in `queries.py` to include for example additional fields or connections. The source can be hacked to add more resources for other
+    repository nodes easily.
+
+    Args:
+        owner (str): The repository owner
+        name (str): The repository name
+        access_token (str): The classic access token. Will be injected from secrets if not provided.
+        items_per_page (int, optional): How many issues/pull requests to get in single page. Defaults to 100.
+        max_items (int, optional): How many issues/pull requests to get in total. None means All.
+        max_item_age_seconds (float, optional): Do not get items older than this. Defaults to None. NOT IMPLEMENTED
+
+    Returns:
+        Sequence[DltResource]: Two DltResources: `issues` with issues and `pull_requests` with pull requests
+    """
+    return (
+        dlt.resource(
+            get_reactions_data(
+                "issues",
+                owner,
+                name,
+                access_token,
+                items_per_page,
+                max_items,
+            ),
+            name="issues",
+            write_disposition="replace",
+        ),
+        dlt.resource(
+            get_reactions_data(
+                "pullRequests",
+                owner,
+                name,
+                access_token,
+                items_per_page,
+                max_items,
+            ),
+            name="pull_requests",
+            write_disposition="replace",
+        ),
+    )
+
+
+@dlt.source(max_table_nesting=0)
+def github_repo_events(
+    owner: str, name: str, access_token: Optional[str] = None
+) -> DltResource:
+    """Gets events for repository `name` with owner `owner` incrementally.
+
+    This source contains a single resource `repo_events` that gets given repository's events and dispatches them to separate tables with names based on event type.
+    The data is loaded incrementally. Subsequent runs will get only new events and append them to tables.
+    Please note that Github allows only for 300 events to be retrieved for public repositories. You should get the events frequently for the active repos.
+
+    Args:
+        owner (str): The repository owner
+        name (str): The repository name
+        access_token (str): The classic or fine-grained access token. If not provided, calls are made anonymously
+
+    Returns:
+        DltSource: source with the `repo_events` resource
+
+    """
+
+    # use naming function in table name to generate separate tables for each event
+    @dlt.resource(primary_key="id", table_name=lambda i: i["type"])
+    def repo_events(
+        last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "created_at", initial_value="1970-01-01T00:00:00Z", last_value_func=max
+        ),
+    ) -> Iterator[TDataItems]:
+        repos_path = (
+            f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
+        )
+
+        for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
+            yield page
+
+            # stop requesting pages if the last element was already older than initial value
+            # note: incremental will skip those items anyway, we just do not want to use the api limits
+            if last_created_at.start_out_of_range:
+                print(
+                    f"Overlap with previous run created at {last_created_at.initial_value}"
+                )
+                break
+
+    return repo_events
+
+
+@dlt.source
+def github_stargazers(
+    owner: str,
+    name: str,
+    access_token: str = dlt.secrets.value,
+    items_per_page: int = 100,
+    max_items: Optional[int] = None,
+) -> Sequence[DltResource]:
+    """Get stargazers in the repo `name` with owner `owner`.
+
+    This source uses graphql to retrieve all stargazers with the associated starred date,
+    Internally graphql is used to retrieve data. It is cost optimized and you are able to retrieve the
+    data for fairly large repos quickly and cheaply.
+
+    Args:
+        owner (str): The repository owner
+        name (str): The repository name
+        access_token (str): The classic access token. Will be injected from secrets if not provided.
+        items_per_page (int, optional): How many issues/pull requests to get in single page. Defaults to 100.
+        max_items (int, optional): How many issues/pull requests to get in total. None means All.
+
+    Returns:
+        Sequence[DltResource]: One DltResource: `stargazers`
+    """
+    return (
+        dlt.resource(
+            get_stargazers(
+                owner,
+                name,
+                access_token,
+                items_per_page,
+                max_items,
+            ),
+            name="stargazers",
+            write_disposition="replace",
+        ),
+    )
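For orientation, a minimal sketch of running one of these sources directly with dlt. The pipeline name, dataset name, and repository are placeholders, and duckdb is assumed as the destination:

    import dlt

    from ingestr.src.github import github_repo_events

    pipeline = dlt.pipeline(
        pipeline_name="github_events",
        destination="duckdb",
        dataset_name="github_data",
    )
    # events land in one table per event type and load incrementally on created_at
    print(pipeline.run(github_repo_events("dlt-hub", "dlt")))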
ingestr/src/github/helpers.py
ADDED
@@ -0,0 +1,193 @@
+from typing import Iterator, List, Optional, Tuple
+
+from dlt.common.typing import DictStrAny, StrAny
+from dlt.common.utils import chunks
+from dlt.sources.helpers import requests
+
+from .queries import COMMENT_REACTIONS_QUERY, ISSUES_QUERY, RATE_LIMIT, STARGAZERS_QUERY
+from .settings import GRAPHQL_API_BASE_URL, REST_API_BASE_URL
+
+
+#
+# Shared
+#
+def _get_auth_header(access_token: Optional[str]) -> StrAny:
+    if access_token:
+        return {"Authorization": f"Bearer {access_token}"}
+    else:
+        # REST API works without access token (with high rate limits)
+        return {}
+
+
+#
+# Rest API helpers
+#
+def get_rest_pages(access_token: Optional[str], query: str) -> Iterator[List[StrAny]]:
+    def _request(page_url: str) -> requests.Response:
+        r = requests.get(page_url, headers=_get_auth_header(access_token))
+        print(
+            f"got page {page_url}, requests left: " + r.headers["x-ratelimit-remaining"]
+        )
+        return r
+
+    next_page_url = REST_API_BASE_URL + query
+    while True:
+        r: requests.Response = _request(next_page_url)
+        page_items = r.json()
+        if len(page_items) == 0:
+            break
+        yield page_items
+        if "next" not in r.links:
+            break
+        next_page_url = r.links["next"]["url"]
+
+
+#
+# GraphQL API helpers
+#
+def get_stargazers(
+    owner: str,
+    name: str,
+    access_token: str,
+    items_per_page: int,
+    max_items: Optional[int],
+) -> Iterator[Iterator[StrAny]]:
+    variables = {"owner": owner, "name": name, "items_per_page": items_per_page}
+    for page_items in _get_graphql_pages(
+        access_token, STARGAZERS_QUERY, variables, "stargazers", max_items
+    ):
+        yield map(
+            lambda item: {"starredAt": item["starredAt"], "user": item["node"]},
+            page_items,
+        )
+
+
+def get_reactions_data(
+    node_type: str,
+    owner: str,
+    name: str,
+    access_token: str,
+    items_per_page: int,
+    max_items: Optional[int],
+) -> Iterator[Iterator[StrAny]]:
+    variables = {
+        "owner": owner,
+        "name": name,
+        "issues_per_page": items_per_page,
+        "first_reactions": 100,
+        "first_comments": 100,
+        "node_type": node_type,
+    }
+    for page_items in _get_graphql_pages(
+        access_token, ISSUES_QUERY % node_type, variables, node_type, max_items
+    ):
+        # use reactionGroups to query for reactions to comments that have any reactions. reduces cost by 10-50x
+        reacted_comment_ids = {}
+        for item in page_items:
+            for comment in item["comments"]["nodes"]:
+                if any(group["createdAt"] for group in comment["reactionGroups"]):
+                    # print(f"for comment {comment['id']}: has reaction")
+                    reacted_comment_ids[comment["id"]] = comment
+                # if "reactionGroups" in comment:
+                comment.pop("reactionGroups", None)
+
+        # get comment reactions by querying comment nodes separately
+        comment_reactions = _get_comment_reaction(
+            list(reacted_comment_ids.keys()), access_token
+        )
+        # attach the reaction nodes where they should be
+        for comment in comment_reactions.values():
+            comment_id = comment["id"]
+            reacted_comment_ids[comment_id]["reactions"] = comment["reactions"]
+        yield map(_extract_nested_nodes, page_items)
+
+
+def _extract_top_connection(data: StrAny, node_type: str) -> StrAny:
+    assert (
+        isinstance(data, dict) and len(data) == 1
+    ), f"The data with list of {node_type} must be a dictionary and contain only one element"
+    data = next(iter(data.values()))
+    return data[node_type]  # type: ignore
+
+
+def _extract_nested_nodes(item: DictStrAny) -> DictStrAny:
+    """Recursively moves `nodes` and `totalCount` to reduce nesting."""
+    item["reactions_totalCount"] = item["reactions"].get("totalCount", 0)
+    item["reactions"] = item["reactions"]["nodes"]
+    comments = item["comments"]
+    item["comments_totalCount"] = item["comments"].get("totalCount", 0)
+    for comment in comments["nodes"]:
+        if "reactions" in comment:
+            comment["reactions_totalCount"] = comment["reactions"].get("totalCount", 0)
+            comment["reactions"] = comment["reactions"]["nodes"]
+    item["comments"] = comments["nodes"]
+    return item
+
+
+def _run_graphql_query(
+    access_token: str, query: str, variables: DictStrAny
+) -> Tuple[StrAny, StrAny]:
+    def _request() -> requests.Response:
+        r = requests.post(
+            GRAPHQL_API_BASE_URL,
+            json={"query": query, "variables": variables},
+            headers=_get_auth_header(access_token),
+        )
+        return r
+
+    data = _request().json()
+    if "errors" in data:
+        raise ValueError(data)
+    data = data["data"]
+    # pop rate limits
+    rate_limit = data.pop("rateLimit", {"cost": 0, "remaining": 0})
+    return data, rate_limit
+
+
+def _get_graphql_pages(
+    access_token: str, query: str, variables: DictStrAny, node_type: str, max_items: int
+) -> Iterator[List[DictStrAny]]:
+    items_count = 0
+    while True:
+        data, rate_limit = _run_graphql_query(access_token, query, variables)
+        top_connection = _extract_top_connection(data, node_type)
+        data_items = (
+            top_connection["nodes"]
+            if "nodes" in top_connection
+            else top_connection["edges"]
+        )
+        items_count += len(data_items)
+        print(
+            f'Got {len(data_items)}/{items_count} {node_type}s, query cost {rate_limit["cost"]}, remaining credits: {rate_limit["remaining"]}'
+        )
+        if data_items:
+            yield data_items
+        else:
+            return
+        # print(data["repository"][node_type]["pageInfo"]["endCursor"])
+        variables["page_after"] = _extract_top_connection(data, node_type)["pageInfo"][
+            "endCursor"
+        ]
+        if max_items and items_count >= max_items:
+            print(f"Max items limit reached: {items_count} >= {max_items}")
+            return
+
+
+def _get_comment_reaction(comment_ids: List[str], access_token: str) -> StrAny:
+    """Builds a query from a list of comment nodes and returns associated reactions."""
+    idx = 0
+    data: DictStrAny = {}
+    for page_chunk in chunks(comment_ids, 50):
+        subs = []
+        for comment_id in page_chunk:
+            subs.append(COMMENT_REACTIONS_QUERY % (idx, comment_id))
+            idx += 1
+        subs.append(RATE_LIMIT)
+        query = "{" + ",\n".join(subs) + "}"
+        # print(query)
+        page, rate_limit = _run_graphql_query(access_token, query, {})
+        print(
+            f'Got {len(page)} comments, query cost {rate_limit["cost"]}, remaining credits: {rate_limit["remaining"]}'
+        )
+        data.update(page)
+    return data
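To see the batched document _get_comment_reaction assembles, one can run a couple of placeholder node ids through the same template (the ids here are invented; real ones come from the issues/pull-requests pages):

    from ingestr.src.github.queries import COMMENT_REACTIONS_QUERY, RATE_LIMIT

    comment_ids = ["NODE_ID_A", "NODE_ID_B"]  # placeholder GraphQL node ids
    subs = [COMMENT_REACTIONS_QUERY % (idx, cid) for idx, cid in enumerate(comment_ids)]
    subs.append(RATE_LIMIT)
    # one request resolves up to 50 aliased node_0, node_1, ... fields plus rateLimit
    query = "{" + ",\n".join(subs) + "}"
    print(query)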
ingestr/src/github/queries.py
ADDED
@@ -0,0 +1,115 @@
+RATE_LIMIT = """
+rateLimit {
+  limit
+  cost
+  remaining
+  resetAt
+}
+"""
+
+ISSUES_QUERY = """
+query($owner: String!, $name: String!, $issues_per_page: Int!, $first_reactions: Int!, $first_comments: Int!, $page_after: String) {
+  repository(owner: $owner, name: $name) {
+    %s(first: $issues_per_page, orderBy: {field: CREATED_AT, direction: DESC}, after: $page_after) {
+      totalCount
+      pageInfo {
+        endCursor
+        startCursor
+      }
+      nodes {
+        # id
+        number
+        url
+        title
+        body
+        author {login avatarUrl url}
+        authorAssociation
+        closed
+        closedAt
+        createdAt
+        state
+        updatedAt
+        reactions(first: $first_reactions) {
+          totalCount
+          nodes {
+            # id
+            user {login avatarUrl url}
+            content
+            createdAt
+          }
+        }
+        comments(first: $first_comments) {
+          totalCount
+          nodes {
+            id
+            url
+            body
+            author {avatarUrl login url}
+            authorAssociation
+            createdAt
+            reactionGroups {content createdAt}
+            # reactions(first: 0) {
+            #   totalCount
+            #   nodes {
+            #     # id
+            #     user {login avatarUrl url}
+            #     content
+            #     createdAt
+            #   }
+            # }
+          }
+        }
+      }
+    }
+  }
+  rateLimit {
+    limit
+    cost
+    remaining
+    resetAt
+  }
+}
+"""
+
+COMMENT_REACTIONS_QUERY = """
+node_%s: node(id:"%s") {
+  ... on IssueComment {
+    id
+    reactions(first: 100) {
+      totalCount
+      nodes {
+        user {login avatarUrl url}
+        content
+        createdAt
+      }
+    }
+  }
+}
+"""
+
+STARGAZERS_QUERY = """
+query($owner: String!, $name: String!, $items_per_page: Int!, $page_after: String) {
+  repository(owner: $owner, name: $name) {
+    stargazers(first: $items_per_page, orderBy: {field: STARRED_AT, direction: DESC}, after: $page_after) {
+      pageInfo {
+        endCursor
+        startCursor
+      }
+      edges {
+        starredAt
+        node {
+          login
+          avatarUrl
+          url
+        }
+      }
+    }
+  }
+  rateLimit {
+    limit
+    cost
+    remaining
+    resetAt
+  }
+}
+"""
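The %s in ISSUES_QUERY is the repository connection name; helpers.py substitutes it before sending the request. A short sketch of the substitution:

    from ingestr.src.github.queries import ISSUES_QUERY

    # the source only ever uses "issues" or "pullRequests" here
    issues_doc = ISSUES_QUERY % "issues"
    pull_requests_doc = ISSUES_QUERY % "pullRequests"
    assert "issues(first: $issues_per_page" in issues_doc
    assert "pullRequests(first: $issues_per_page" in pull_requests_doc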
ingestr/src/github/settings.py
ADDED
@@ -0,0 +1,10 @@
+"""Github source settings and constants."""
+
+START_DATE = "1970-01-01T00:00:00Z"
+
+# rest queries
+REST_API_BASE_URL = "https://api.github.com"
+REPO_EVENTS_PATH = "/repos/%s/%s/events"
+
+# graphql queries
+GRAPHQL_API_BASE_URL = "https://api.github.com/graphql"
ingestr/src/google_analytics/__init__.py
ADDED
@@ -0,0 +1,70 @@
+"""
+Defines all the sources and resources needed for Google Analytics V4
+"""
+
+from typing import List, Optional, Union
+
+import dlt
+from dlt.common.typing import DictStrAny
+from dlt.sources import DltResource
+from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
+from google.analytics.data_v1beta import BetaAnalyticsDataClient
+
+from .helpers import basic_report
+
+
+@dlt.source(max_table_nesting=0)
+def google_analytics(
+    datetime: str,
+    credentials: Union[
+        GcpOAuthCredentials, GcpServiceAccountCredentials
+    ] = dlt.secrets.value,
+    property_id: int = dlt.config.value,
+    queries: List[DictStrAny] = dlt.config.value,
+    start_date: Optional[str] = "2015-08-14",
+    rows_per_page: int = 10000,
+) -> List[DltResource]:
+    try:
+        property_id = int(property_id)
+    except ValueError:
+        raise ValueError(
+            f"{property_id} is an invalid google property id. Please use a numeric id, and not your Measurement ID like G-7F1AE12JLR"
+        )
+    if property_id == 0:
+        raise ValueError(
+            "Google Analytics property id is 0. Did you forget to configure it?"
+        )
+    if not rows_per_page:
+        raise ValueError("Rows per page cannot be 0")
+    # generate access token for credentials if we are using OAuth2.0
+    if isinstance(credentials, GcpOAuthCredentials):
+        credentials.auth("https://www.googleapis.com/auth/analytics.readonly")
+
+    # Build the service object for Google Analytics api.
+    client = BetaAnalyticsDataClient(credentials=credentials.to_native_credentials())
+    if len(queries) > 1:
+        raise ValueError(
+            "Google Analytics supports a single query ingestion at a time, please give only one query"
+        )
+    query = queries[0]
+
+    # always add "date" to dimensions so we are able to track the last day of a report
+    dimensions = query["dimensions"]
+    resource_name = query["resource_name"]
+
+    res = dlt.resource(
+        basic_report, name="basic_report", merge_key=datetime, write_disposition="merge"
+    )(
+        client=client,
+        rows_per_page=rows_per_page,
+        property_id=property_id,
+        dimensions=dimensions,
+        metrics=query["metrics"],
+        resource_name=resource_name,
+        start_date=start_date,
+        last_date=dlt.sources.incremental(
+            datetime
+        ),  # pass empty primary key to avoid unique checks, a primary key defined by the resource will be used
+    )
+
+    return [res]
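A minimal sketch of calling the new source. The property id and query are placeholders, and credentials are assumed to resolve from dlt secrets (or an explicit GcpServiceAccountCredentials instance passed in):

    import dlt

    from ingestr.src.google_analytics import google_analytics

    source = google_analytics(
        datetime="date",        # dimension used as the incremental cursor
        property_id=123456789,  # numeric GA4 property id, not a "G-..." id
        queries=[
            {
                "resource_name": "basic",
                "dimensions": ["date", "country"],
                "metrics": ["sessions"],
            }
        ],
    )
    pipeline = dlt.pipeline(
        pipeline_name="ga4", destination="duckdb", dataset_name="analytics"
    )
    pipeline.run(source)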