ingestr 0.13.58__py3-none-any.whl → 0.13.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic.
- ingestr/main.py +1 -0
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/factory.py +4 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/sources.py +71 -3
- ingestr/src/stripe_analytics/__init__.py +1 -42
- ingestr/src/stripe_analytics/helpers.py +8 -62
- {ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/METADATA +1 -1
- {ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/RECORD +13 -11
- {ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/WHEEL +0 -0
- {ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED

@@ -543,6 +543,7 @@ def ingest(
         sql_reflection_level=sql_reflection_level.value,
         sql_limit=sql_limit,
         sql_exclude_columns=sql_exclude_columns,
+        extract_parallelism=extract_parallelism,
     )

     resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
ingestr/src/buildinfo.py
CHANGED

@@ -1 +1 @@
-version = "v0.13.58"
+version = "v0.13.60"
ingestr/src/factory.py
CHANGED

@@ -42,6 +42,7 @@ from ingestr.src.sources import (
     GoogleSheetsSource,
     GorgiasSource,
     HubspotSource,
+    IsocPulseSource,
     KafkaSource,
     KinesisSource,
     KlaviyoSource,
@@ -52,6 +53,7 @@ from ingestr.src.sources import (
     NotionSource,
     PersonioSource,
     PhantombusterSource,
+    PinterestSource,
     PipedriveSource,
     QuickBooksSource,
     S3Source,
@@ -172,8 +174,10 @@ class SourceDestinationFactory:
         "attio": AttioSource,
         "solidgate": SolidgateSource,
         "quickbooks": QuickBooksSource,
+        "isoc-pulse": IsocPulseSource,
         "smartsheet": SmartsheetSource,
         "sftp": SFTPSource,
+        "pinterest": PinterestSource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
ingestr/src/isoc_pulse/__init__.py
ADDED

@@ -0,0 +1,159 @@
+import math
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Dict, Iterable, List, Optional
+
+import dlt
+from dlt.sources.rest_api import RESTAPIConfig, rest_api_resources
+
+METRICS: Dict[str, str] = {
+    "dnssec_adoption": "dnssec/adoption",
+    "dnssec_tld_adoption": "dnssec/adoption",
+    "dnssec_validation": "dnssec/validation",
+    "http": "http",
+    "http3": "http3",
+    "https": "https",
+    "ipv6": "ipv6",
+    "net_loss": "net-loss",
+    "resilience": "resilience",
+    "roa": "roa",
+    "rov": "rov",
+    "tls": "tls",
+    "tls13": "tls13",
+}
+
+
+@dlt.source
+def pulse_source(
+    token: str,
+    start_date: str,
+    metric: str,
+    opts: List[str],
+    end_date: Optional[str] = None,
+) -> Iterable[dlt.sources.DltResource]:
+    validate(metric, opts)
+    cfg = get_metric_cfg(metric, opts, start_date)
+    endpoint: Dict[str, Any] = {
+        "path": cfg.path,
+        "params": {
+            "start_date": "{incremental.start_value}",
+            **cfg.params,
+        },
+        "incremental": {
+            "cursor_path": "date",
+            "start_param": "start_date",
+            "end_param": "end_date",
+            "initial_value": start_date,
+            "end_value": end_date,
+            "range_start": "closed",
+            "range_end": "closed",
+        },
+        "paginator": "single_page",
+    }
+
+    if end_date is not None:
+        endpoint["params"]["end_date"] = end_date
+
+    resources = [
+        {
+            "name": metric,
+            "write_disposition": "merge",
+            "primary_key": "date",
+            "columns": {"date": {"data_type": "date"}},
+            "endpoint": endpoint,
+        }
+    ]
+
+    config: RESTAPIConfig = {
+        "client": {
+            "base_url": "https://pulse.internetsociety.org/api/",
+            "headers": {"Authorization": f"Bearer {token}"},
+        },
+        "resource_defaults": {
+            "write_disposition": "merge",
+            "primary_key": "date",
+        },
+        "resources": resources,  # type:ignore
+    }
+    res = rest_api_resources(config)
+    if metric == "net_loss":
+        res[0].add_map(add_date(start_date))
+    yield from res
+
+
+@dataclass
+class MetricCfg:
+    path: str
+    params: Dict[str, Any]
+
+
+def get_metric_cfg(metric: str, opts: List[str], start_date: str) -> MetricCfg:
+    path = METRICS.get(metric)
+    if path is None:
+        raise ValueError(f"Unknown metric '{metric}'.")
+    if len(opts) == 0:
+        return MetricCfg(path=path, params={})
+
+    if metric == "https":
+        return MetricCfg(
+            path=f"{path}/country/{opts[-1]}",
+            params={
+                "topsites": True if "topsites" in opts else False,
+            },
+        )
+    elif metric in ["dnssec_validation", "dnssec_tld_adoption"]:
+        return MetricCfg(path=f"{path}/country/{opts[-1]}", params={})
+    elif metric == "dnssec_adoption":
+        return MetricCfg(path=f"{path}/domains/{opts[-1]}", params={})
+    elif metric == "ipv6":
+        if "topsites" in opts:
+            return MetricCfg(path=path, params={"topsites": True})
+        return MetricCfg(path=f"{path}/country/{opts[-1]}", params={})
+    elif metric == "roa":
+        if len(opts) > 1:
+            return MetricCfg(
+                path=f"{path}/country/{opts[-1]}", params={"ip_version": opts[-2]}
+            )
+        return MetricCfg(path=path, params={"ip_version": opts[-1]})
+    elif metric == "net_loss":
+        return MetricCfg(
+            path=path,
+            params={
+                "country": opts[-1],
+                "shutdown_type": opts[-2],
+            },
+        )
+    elif metric == "resilience":
+        date = datetime.strptime(start_date, "%Y-%m-%d")
+        return MetricCfg(
+            path=path,
+            params={
+                "country": opts[-1],
+                "year": date.year,
+                "quarter": math.floor(date.month / 4) + 1,
+            },
+        )
+    else:
+        raise ValueError(
+            f"Unsupported metric '{metric}' with options {opts}. "
+            "Please check the metric and options."
+        )
+
+
+def add_date(start_date: str):
+    def transform(item: dict):
+        item["date"] = start_date
+        return item
+
+    return transform
+
+
+def validate(metric: str, opts: List[str]) -> None:
+    nopts = len(opts)
+    if metric == "net_loss" and nopts != 2:
+        raise ValueError(
+            "For 'net_loss' metric, two options are required: "
+            "'shutdown_type' and 'country'."
+        )
+    if nopts > 0 and metric in ["http", "http3", "tls", "tls13", "rov"]:
+        raise ValueError(f"metric '{metric}' does not support options. ")
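The new Internet Society Pulse source is a plain dlt REST source, so it can also be exercised on its own. A minimal sketch, assuming a local duckdb destination; the pipeline name, dataset name, and token value are hypothetical placeholders, and only the `pulse_source` signature shown above is relied on:

```python
import dlt

from ingestr.src.isoc_pulse import pulse_source

# Hypothetical pipeline; destination, names, and the token are placeholders.
pipeline = dlt.pipeline(
    pipeline_name="isoc_pulse_demo",
    destination="duckdb",
    dataset_name="pulse",
)

# "ipv6" with a country option resolves to the ipv6/country/<code> path above.
source = pulse_source(
    token="PULSE_API_TOKEN",
    start_date="2024-01-01",
    metric="ipv6",
    opts=["US"],
)

pipeline.run(source.with_resources("ipv6"))
```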
ingestr/src/pinterest/__init__.py
ADDED

@@ -0,0 +1,82 @@
+from typing import Iterable
+
+import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import TDataItem
+from dlt.sources import DltResource
+from dlt.sources.helpers import requests
+
+
+@dlt.source(name="pinterest", max_table_nesting=0)
+def pinterest_source(
+    start_date: pendulum.DateTime,
+    access_token: str,
+    page_size: int = 200,
+    end_date: pendulum.DateTime | None = None,
+) -> Iterable[DltResource]:
+    session = requests.Session()
+    session.headers.update({"Authorization": f"Bearer {access_token}"})
+    base_url = "https://api.pinterest.com/v5"
+
+    def fetch_data(
+        endpoint: str,
+        start_dt: pendulum.DateTime,
+        end_dt: pendulum.DateTime,
+    ) -> Iterable[TDataItem]:
+        url = f"{base_url}/{endpoint}"
+        params = {"page_size": page_size}
+        bookmark = None
+        while True:
+            if bookmark:
+                params["bookmark"] = bookmark
+
+            resp = session.get(url, params=params)
+            resp.raise_for_status()
+            data = resp.json()
+            items = data.get("items") or []
+
+            for item in items:
+                item_created = ensure_pendulum_datetime(item["created_at"])
+                if item_created <= start_dt:
+                    continue
+                if item_created > end_dt:
+                    continue
+                item["created_at"] = item_created
+                yield item
+
+            bookmark = data.get("bookmark")
+            if not bookmark:
+                break
+
+    @dlt.resource(write_disposition="merge", primary_key="id")
+    def pins(
+        datetime=dlt.sources.incremental(
+            "created_at",
+            initial_value=start_date,
+            end_value=end_date,
+        ),
+    ) -> Iterable[TDataItem]:
+        _start_date = datetime.last_value or start_date
+        if end_date is None:
+            _end_date = pendulum.now("UTC")
+        else:
+            _end_date = datetime.end_value
+        yield from fetch_data("pins", _start_date, _end_date)
+
+    @dlt.resource(write_disposition="merge", primary_key="id")
+    def boards(
+        datetime=dlt.sources.incremental(
+            "created_at",
+            initial_value=start_date,
+            end_value=end_date,
+        ),
+    ) -> Iterable[TDataItem]:
+        _start_date = datetime.last_value or start_date
+        if end_date is None:
+            _end_date = pendulum.now("UTC")
+        else:
+            _end_date = datetime.end_value
+        yield from fetch_data("boards", _start_date, _end_date)
+
+    return pins, boards
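Similarly, a minimal sketch of running the new Pinterest source directly; the pipeline name, dataset name, and access token are hypothetical placeholders:

```python
import dlt
import pendulum

from ingestr.src.pinterest import pinterest_source

# Hypothetical pipeline; destination, names, and the token are placeholders.
pipeline = dlt.pipeline(
    pipeline_name="pinterest_demo",
    destination="duckdb",
    dataset_name="pinterest",
)

src = pinterest_source(
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
    access_token="PINTEREST_ACCESS_TOKEN",
)

# Load only the "pins" resource; "boards" works the same way.
pipeline.run(src.with_resources("pins"))
```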
ingestr/src/sources.py
CHANGED

@@ -699,9 +699,7 @@ class StripeAnalyticsSource:
         )

         if incremental and not sync:
-            raise ValueError(
-                "incremental loads must be used with sync loading"
-            )
+            raise ValueError("incremental loads must be used with sync loading")

         if incremental:
             from ingestr.src.stripe_analytics import incremental_stripe_source
@@ -739,6 +737,7 @@ class StripeAnalyticsSource:
                 endpoint,
             ],
             stripe_secret_key=api_key[0],
+            max_workers=kwargs.get("extract_parallelism", 4),
         ).with_resources(endpoint)

         raise ValueError(
@@ -2783,3 +2782,72 @@ class QuickBooksSource:
             minor_version=minor_version[0],
             object=table_name,
         ).with_resources(table_name)
+
+
+class IsocPulseSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        token = params.get("token")
+        if not token or not token[0].strip():
+            raise MissingValueError("token", "Internet Society Pulse")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = pendulum.now().in_tz("UTC").subtract(days=30)
+
+        end_date = kwargs.get("interval_end")
+
+        metric = table
+        opts = []
+        if ":" in metric:
+            metric, *opts = metric.strip().split(":")
+            opts = [opt.strip() for opt in opts]
+
+        from ingestr.src.isoc_pulse import pulse_source
+
+        src = pulse_source(
+            token=token[0],
+            start_date=start_date.strftime("%Y-%m-%d"),
+            end_date=str(end_date) if end_date else None,
+            metric=metric,
+            opts=opts,
+        )
+        return src.with_resources(metric)
+
+
+class PinterestSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed = urlparse(uri)
+        params = parse_qs(parsed.query)
+        access_token = params.get("access_token")
+
+        if not access_token:
+            raise MissingValueError("access_token", "Pinterest")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.pinterest import pinterest_source
+
+        if table not in {"pins", "boards"}:
+            raise UnsupportedResourceError(table, "Pinterest")
+
+        return pinterest_source(
+            access_token=access_token[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
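The `IsocPulseSource.dlt_source` method above derives the Pulse metric and its options from the requested table name by splitting on `:`. A standalone sketch of that parsing; the example table strings are illustrative, not an exhaustive list of supported values:

```python
def parse_pulse_table(table: str) -> tuple[str, list[str]]:
    # Mirrors the parsing in IsocPulseSource.dlt_source: the segment before the
    # first ':' is the metric, the remaining segments become the opts list.
    metric = table
    opts: list[str] = []
    if ":" in metric:
        metric, *opts = metric.strip().split(":")
        opts = [opt.strip() for opt in opts]
    return metric, opts


print(parse_pulse_table("https"))          # ('https', [])
print(parse_pulse_table("roa:6:US"))       # ('roa', ['6', 'US'])
print(parse_pulse_table("ipv6:topsites"))  # ('ipv6', ['topsites'])
```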
ingestr/src/stripe_analytics/__init__.py
CHANGED

@@ -10,7 +10,6 @@ from pendulum import DateTime
 from .helpers import (
     async_parallel_pagination,
     pagination,
-    parallel_pagination,
     transform_date,
 )

@@ -55,53 +54,13 @@ def stripe_source(
     )(endpoint)


-@dlt.source(max_table_nesting=0)
-def parallel_stripe_source(
-    endpoints: Tuple[str, ...],
-    stripe_secret_key: str = dlt.secrets.value,
-    start_date: Optional[DateTime] = None,
-    end_date: Optional[DateTime] = None,
-    max_workers: int = 12,
-) -> Iterable[DltResource]:
-    """
-    Retrieves data from the Stripe API for the specified endpoints using parallel pagination.
-
-    This source divides the date range across multiple workers to fetch data in parallel,
-    which can significantly speed up data retrieval for large date ranges.
-
-    Args:
-        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
-        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
-        start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Required for parallel processing.
-        end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Required for parallel processing.
-        max_workers (int): Maximum number of worker threads for parallel fetching. Defaults to 4.
-
-    Returns:
-        Iterable[DltResource]: Resources with data that was created during the period greater than or equal to 'start_date' and less than 'end_date'.
-    """
-    stripe.api_key = stripe_secret_key
-    stripe.api_version = "2022-11-15"
-
-    def parallel_stripe_resource(
-        endpoint: str,
-    ) -> Generator[Dict[Any, Any], Any, None]:
-        yield from parallel_pagination(endpoint, start_date, end_date, max_workers)
-
-    for endpoint in endpoints:
-        yield dlt.resource(
-            parallel_stripe_resource,
-            name=endpoint,
-            write_disposition="replace",
-        )(endpoint)
-
-
 @dlt.source(max_table_nesting=0)
 def async_stripe_source(
     endpoints: Tuple[str, ...],
     stripe_secret_key: str = dlt.secrets.value,
     start_date: Optional[DateTime] = None,
     end_date: Optional[DateTime] = None,
-    max_workers: int =
+    max_workers: int = 4,
     rate_limit_delay: float = 0.03,
 ) -> Iterable[DltResource]:
     """
ingestr/src/stripe_analytics/helpers.py
CHANGED

@@ -43,67 +43,6 @@ def pagination(
             break


-def parallel_pagination(
-    endpoint: str,
-    start_date: Optional[Any] = None,
-    end_date: Optional[Any] = None,
-    max_workers: int = 4,
-) -> Iterable[TDataItem]:
-    """
-    Retrieves data from an endpoint with parallel pagination by dividing date ranges across workers.
-
-    Args:
-        endpoint (str): The endpoint to retrieve data from.
-        start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to 2010-01-01 if None.
-        end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to today if None.
-        max_workers (int): Maximum number of worker threads to use for parallel fetching. Defaults to 4.
-
-    Returns:
-        Iterable[TDataItem]: Data items retrieved from the endpoint.
-    """
-    # Set default date range if not provided: 2010 to today
-    if not start_date:
-        start_date = pendulum.datetime(2010, 1, 1)
-    if not end_date:
-        end_date = pendulum.now()
-
-    # Convert dates to timestamps for processing
-    start_ts = transform_date(start_date)
-    end_ts = transform_date(end_date)
-
-    # If date range is very small, use sequential pagination
-    date_range_days = (end_ts - start_ts) / (24 * 60 * 60)
-    if date_range_days < 30:  # Less than 30 days
-        yield from pagination(endpoint, start_date, end_date)
-        return
-
-    # Create time chunks with larger chunks for 2010s (less data expected)
-    time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)
-
-    # Use ThreadPoolExecutor to fetch data in parallel and yield as soon as ready
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Submit all tasks
-        future_to_chunk = {
-            executor.submit(
-                _fetch_chunk_data_streaming, endpoint, chunk_start, chunk_end
-            ): (chunk_start, chunk_end)
-            for chunk_start, chunk_end in time_chunks
-        }
-
-        # MAXIMUM SPEED - Yield results immediately as they complete
-        for future in as_completed(future_to_chunk):
-            chunk_start, chunk_end = future_to_chunk[future]
-            try:
-                chunk_data = future.result()
-                # Yield all batches from this chunk immediately - NO ORDERING
-                for batch in chunk_data:
-                    yield batch
-
-            except Exception as exc:
-                print(f"Chunk {chunk_start}-{chunk_end} generated an exception: {exc}")
-                raise exc
-
-
 def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
     """
     Divide a time range into equal chunks for parallel processing.
@@ -295,7 +234,6 @@ async def async_parallel_pagination(

     async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
         async with semaphore:
-            await asyncio.sleep(rate_limit_delay)
             return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)

     # Create all tasks
@@ -390,6 +328,10 @@ async def stripe_get_data_async(
     max_wait_time_ms = 10000

     while retry_count < max_retries:
+        # print(
+        #     f"Fetching {resource} from {datetime.fromtimestamp(start_date).strftime('%Y-%m-%d %H:%M:%S') if start_date else 'None'} to {datetime.fromtimestamp(end_date).strftime('%Y-%m-%d %H:%M:%S') if end_date else 'None'}, retry {retry_count} of {max_retries}",
+        #     flush=True,
+        # )
         try:
             resource_dict = await getattr(stripe, resource).list_async(
                 created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
@@ -399,6 +341,10 @@ async def stripe_get_data_async(
             retry_count += 1
             if retry_count < max_retries:
                 wait_time = min(2**retry_count * 0.001, max_wait_time_ms)
+                print(
+                    f"Got rate limited, sleeping {wait_time} seconds before retrying...",
+                    flush=True,
+                )
                 await asyncio.sleep(wait_time)
             else:
                 # Re-raise the last exception if we've exhausted retries
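For reference, the retry branch added above sleeps for min(2**retry_count * 0.001, max_wait_time_ms) before retrying. A standalone sketch of the values that formula produces, with the constants copied from the diff and a purely illustrative loop:

```python
# Reproduces the backoff computation from stripe_get_data_async for a few retries.
max_wait_time_ms = 10000
max_retries = 5

for retry_count in range(1, max_retries):
    wait_time = min(2**retry_count * 0.001, max_wait_time_ms)
    print(f"retry {retry_count}: sleeping {wait_time} seconds before retrying")
```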
{ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.58
+Version: 0.13.60
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
{ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/RECORD
CHANGED

@@ -1,17 +1,17 @@
 ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
-ingestr/main.py,sha256=
+ingestr/main.py,sha256=taDyHyaVSpB17iNLl8zA0gmr4CqDO-MSTQX1CaRBB9U,26364
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
-ingestr/src/buildinfo.py,sha256=
+ingestr/src/buildinfo.py,sha256=1sTup4WLO36DuLnh5cnxtmEDBjKKYxAOSisEvjELy1w,21
 ingestr/src/destinations.py,sha256=TcxM2rcwHfgMMP6U0yRNcfWKlEzkBbZbqCIDww7lkTY,16882
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256=
+ingestr/src/factory.py,sha256=OKqjYqvHhgaOF48-eSNSabcfXt4Gmr1yZ8cFGizXh0g,6319
 ingestr/src/filters.py,sha256=LLecXe9QkLFkFLUZ92OXNdcANr1a8edDxrflc2ko_KA,1452
 ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
 ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
 ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
-ingestr/src/sources.py,sha256=
+ingestr/src/sources.py,sha256=sJmiiInFb-KCPsaIy4qus6lx59MDCOobWgxJ7lfKH08,99047
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -73,6 +73,7 @@ ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOIN
 ingestr/src/hubspot/__init__.py,sha256=wqHefhc_YRI5dNFCcpvH-UUilNThE49sbGouSBiHYsw,11776
 ingestr/src/hubspot/helpers.py,sha256=k2b-lhxqBNKHoOSHoHegFSsk8xxjjGA0I04V0XyX2b4,7883
 ingestr/src/hubspot/settings.py,sha256=i73MkSiJfRLMFLfiJgYdhp-rhymHTfoqFzZ4uOJdFJM,2456
+ingestr/src/isoc_pulse/__init__.py,sha256=WDgKBn15gyQheXE6oJ_2OuMUQwKPbAjflKAsnucu7u8,4647
 ingestr/src/kafka/__init__.py,sha256=wMCXdiraeKd1Kssi9WcVCGZaNGm2tJEtnNyuB4aR5_k,3541
 ingestr/src/kafka/helpers.py,sha256=V9WcVn3PKnEpggArHda4vnAcaV8VDuh__dSmRviJb5Y,7502
 ingestr/src/kinesis/__init__.py,sha256=YretSz4F28tbkcPhd55mBp2Xk7XE9unyWx0nmvl8iEc,6235
@@ -96,6 +97,7 @@ ingestr/src/personio/__init__.py,sha256=sHYpoV-rg-kA1YsflctChis0hKcTrL6mka9O0CHV
 ingestr/src/personio/helpers.py,sha256=EKmBN0Lf4R0lc3yqqs7D-RjoZ75E8gPcctt59xwHxrY,2901
 ingestr/src/phantombuster/__init__.py,sha256=8AQTiA8fp1NT8TellQQqwBCl6vGvGwUBLif6LIzgAik,1786
 ingestr/src/phantombuster/client.py,sha256=9zx58sFunXjUNh6jeEYLNfwNxGxX9odifwAmS0E9AaY,3018
+ingestr/src/pinterest/__init__.py,sha256=5xTLNE2Vn_00PXMLKjY41Fh1LsyzM7UnBSKxKPITUl0,2581
 ingestr/src/pipedrive/__init__.py,sha256=iRrxeMwo8_83ptgGnTFTNHV1nYvIsFfg0a3XzugPYeI,6982
 ingestr/src/pipedrive/settings.py,sha256=q119Fy4C5Ip1rMoCILX2BkHV3bwiXC_dW58KIiDUzsY,708
 ingestr/src/pipedrive/typing.py,sha256=lEMXu4hhAA3XkhVSlBUa-juqyupisd3c-qSQKxFvzoE,69
@@ -117,8 +119,8 @@ ingestr/src/solidgate/__init__.py,sha256=JdaXvAu5QGuf9-FY294vwCQCEmfrqIld9oqbzqC
 ingestr/src/solidgate/helpers.py,sha256=oePEc9nnvmN3IaKrfJCvyKL79xdGM0-gRTN3-8tY4Fc,4952
 ingestr/src/sql_database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestr/src/sql_database/callbacks.py,sha256=sEFFmXxAURY3yeBjnawigDtq9LBCvi8HFqG4kLd7tMU,2002
-ingestr/src/stripe_analytics/__init__.py,sha256=
-ingestr/src/stripe_analytics/helpers.py,sha256=
+ingestr/src/stripe_analytics/__init__.py,sha256=mK8dGKAlMPVqGE9gG30XfbvOvgVD0yWhNpt-D3iavDY,6385
+ingestr/src/stripe_analytics/helpers.py,sha256=O5ow8xORcyLhw1Yn6vFm__tASfmPOgR0TMVU9gXmxcE,11828
 ingestr/src/stripe_analytics/settings.py,sha256=xt1-ljwP4nLTNUa8l3KwFbtK8FtQHgHpzGF5uPKfRsw,2246
 ingestr/src/telemetry/event.py,sha256=W7bs4uVfPakQ5otmiqgqu1l5SqjYx1p87wudnWXckBc,949
 ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
@@ -141,8 +143,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
 ingestr/tests/unit/test_smartsheets.py,sha256=eiC2CCO4iNJcuN36ONvqmEDryCA1bA1REpayHpu42lk,5058
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
+ingestr-0.13.60.dist-info/METADATA,sha256=FwdcfGIPPRKlSV8wJX1HAqHriGUZBl_XXi0Yco8O874,14993
+ingestr-0.13.60.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.60.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.60.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.60.dist-info/RECORD,,
{ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/WHEEL
File without changes

{ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/entry_points.txt
File without changes

{ingestr-0.13.58.dist-info → ingestr-0.13.60.dist-info}/licenses/LICENSE.md
File without changes