ingestr 0.12.5__py3-none-any.whl → 0.12.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

ingestr/main.py CHANGED
@@ -444,7 +444,7 @@ def ingest(
444
444
 
445
445
  progressInstance: Collector = SpinnerCollector()
446
446
  if progress == Progress.log:
447
- progressInstance = LogCollector(dump_system_stats=False)
447
+ progressInstance = LogCollector()
448
448
 
449
449
  is_pipelines_dir_temp = False
450
450
  if pipelines_dir is None:
@@ -0,0 +1,137 @@
1
+ import csv
2
+ import gzip
3
+ import os
4
+ import tempfile
5
+ from copy import deepcopy
6
+ from datetime import datetime
7
+ from typing import Iterable, List, Optional
8
+
9
+ import dlt
10
+ import requests
11
+ from dlt.common.typing import TDataItem
12
+ from dlt.sources import DltResource
13
+
14
+ from .client import AppStoreConnectClientInterface
15
+ from .errors import (
16
+ NoOngoingReportRequestsFoundError,
17
+ NoReportsFoundError,
18
+ NoSuchReportError,
19
+ )
20
+ from .models import AnalyticsReportInstancesResponse
21
+ from .resources import RESOURCES
22
+
23
+
24
@dlt.source
def app_store(
    client: AppStoreConnectClientInterface,
    app_ids: List[str],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
) -> Iterable[DltResource]:
    """Yield one dlt resource per entry in ``RESOURCES``.

    Every resource wraps ``get_analytics_reports`` and is configured with the
    entry's name, primary key and column hints. The wrapped generator is
    bound to the given client, app ids, the entry's report name and the
    optional date window.
    """
    for config in RESOURCES:
        build_resource = dlt.resource(
            get_analytics_reports,
            name=config.name,
            primary_key=config.primary_key,
            columns=config.columns,
        )
        yield build_resource(
            client, app_ids, config.report_name, start_date, end_date
        )
38
+
39
+
40
def filter_instances_by_date(
    instances: AnalyticsReportInstancesResponse,
    start_date: Optional[datetime],
    end_date: Optional[datetime],
) -> AnalyticsReportInstancesResponse:
    """Return a deep copy of ``instances`` limited to a processing-date window.

    The input object is never mutated. Each bound is inclusive and is only
    applied when it is not ``None``; an instance's ``processingDate`` string
    is parsed with ``datetime.fromisoformat`` before being compared.
    """
    result = deepcopy(instances)

    def processed_at(instance) -> datetime:
        return datetime.fromisoformat(instance.attributes.processingDate)

    if start_date is not None:
        result.data = [
            item for item in result.data if processed_at(item) >= start_date
        ]
    if end_date is not None:
        result.data = [
            item for item in result.data if processed_at(item) <= end_date
        ]

    return result
64
+
65
+
66
def get_analytics_reports(
    client: AppStoreConnectClientInterface,
    app_ids: List[str],
    report_name: str,
    start_date: Optional[datetime],
    end_date: Optional[datetime],
    last_processing_date=dlt.sources.incremental("processing_date"),
) -> Iterable[TDataItem]:
    """Yield the rows of ``report_name`` for every app id in ``app_ids``.

    When the incremental cursor on ``processing_date`` already holds a value,
    that value (parsed with ``datetime.fromisoformat``) replaces the
    caller-supplied ``start_date``.
    """
    cursor_value = last_processing_date.last_value
    if cursor_value:
        start_date = datetime.fromisoformat(cursor_value)

    for app_id in app_ids:
        yield from get_report(client, app_id, report_name, start_date, end_date)
78
+
79
+
80
def _download_to_file(url: str, destination: str) -> None:
    """Stream the body found at ``url`` into the file ``destination``."""
    # Using the response as a context manager releases the connection even
    # when the status check or the write fails part-way through.
    with requests.get(url, stream=True) as payload:
        payload.raise_for_status()
        with open(destination, "wb") as f:
            for chunk in payload.iter_content(chunk_size=8192):
                f.write(chunk)


def _read_gzipped_rows(path: str, delimiter: str) -> Iterable[dict]:
    """Yield each row of the gzip-compressed delimited file at ``path`` as a dict."""
    # An explicit encoding keeps decoding independent of the host locale
    # (assumes the report files are UTF-8 -- TODO confirm against Apple docs);
    # newline="" is what the csv module asks for so that line endings inside
    # quoted fields are handled by the reader itself.
    with gzip.open(path, "rt", encoding="utf-8", newline="") as f:
        yield from csv.DictReader(f, delimiter=delimiter)


def get_report(
    client: AppStoreConnectClientInterface,
    app_id: str,
    report_name: str,
    start_date: Optional[datetime],
    end_date: Optional[datetime],
) -> Iterable[TDataItem]:
    """Yield every row of ``report_name`` for ``app_id`` within the date window.

    The first report request that is ``ONGOING`` and not stopped due to
    inactivity is used. For each matching report, its instances are filtered
    by ``start_date``/``end_date``; every segment of every remaining instance
    is downloaded into a temporary directory, decompressed and parsed as a
    delimited text file. Each yielded dict holds the parsed columns plus a
    ``processing_date`` key taken from the instance.

    Raises:
        NoOngoingReportRequestsFoundError: no usable report request exists.
        NoSuchReportError: the request exposes no report named ``report_name``.
        NoReportsFoundError: a report has no instance inside the date window.
    """
    report_requests = client.list_analytics_report_requests(app_id)
    ongoing_requests = [
        request
        for request in report_requests.data
        if request.attributes.accessType == "ONGOING"
        and not request.attributes.stoppedDueToInactivity
    ]
    if not ongoing_requests:
        raise NoOngoingReportRequestsFoundError()

    reports = client.list_analytics_reports(ongoing_requests[0].id, report_name)
    if not reports.data:
        raise NoSuchReportError(report_name)

    # TODO: infer delimiter from the file itself
    delimiter = "," if report_name == "App Crashes Expanded" else "\t"

    for report in reports.data:
        instances = filter_instances_by_date(
            client.list_report_instances(report.id), start_date, end_date
        )
        if not instances.data:
            raise NoReportsFoundError()

        for instance in instances.data:
            segments = client.list_report_segments(instance.id)
            with tempfile.TemporaryDirectory() as temp_dir:
                # Download all segments first, then parse them, as before.
                files = []
                for segment in segments.data:
                    csv_path = os.path.join(
                        temp_dir, f"{segment.attributes.checksum}.csv"
                    )
                    _download_to_file(segment.attributes.url, csv_path)
                    files.append(csv_path)

                for file in files:
                    for row in _read_gzipped_rows(file, delimiter):
                        yield {
                            "processing_date": instance.attributes.processingDate,
                            **row,
                        }
@@ -0,0 +1,126 @@
1
+ import abc
2
+ import time
3
+ from typing import Optional
4
+
5
+ import jwt
6
+ import requests
7
+ from requests.models import PreparedRequest
8
+
9
+ from .models import (
10
+ AnalyticsReportInstancesResponse,
11
+ AnalyticsReportRequestsResponse,
12
+ AnalyticsReportResponse,
13
+ AnalyticsReportSegmentsResponse,
14
+ )
15
+
16
+
17
class AppStoreConnectClientInterface(abc.ABC):
    """Abstract set of analytics-report lookups that a client must implement."""

    @abc.abstractmethod
    def list_analytics_report_requests(self, app_id) -> AnalyticsReportRequestsResponse:
        """Return the analytics report requests of the app ``app_id``."""

    @abc.abstractmethod
    def list_analytics_reports(
        self, req_id: str, report_name: str
    ) -> AnalyticsReportResponse:
        """Return the reports of request ``req_id`` that match ``report_name``."""

    @abc.abstractmethod
    def list_report_instances(
        self,
        report_id: str,
        granularity: str = "DAILY",
    ) -> AnalyticsReportInstancesResponse:
        """Return the instances of report ``report_id`` for ``granularity``."""

    @abc.abstractmethod
    def list_report_segments(self, instance_id: str) -> AnalyticsReportSegmentsResponse:
        """Return the segments of the report instance ``instance_id``."""
39
+
40
+
41
class AppStoreConnectClient(AppStoreConnectClientInterface):
    """``requests``-based implementation of ``AppStoreConnectClientInterface``.

    Every call is authenticated through ``auth``, which signs a fresh ES256
    JWT from the key, key id and issuer id given at construction time.
    """

    def __init__(self, key: bytes, key_id: str, issuer_id: str):
        self.__key = key
        self.__key_id = key_id
        self.__issuer_id = issuer_id

    def list_analytics_report_requests(self, app_id) -> AnalyticsReportRequestsResponse:
        """Fetch the analytics report requests of ``app_id`` (single page only)."""
        endpoint = f"https://api.appstoreconnect.apple.com/v1/apps/{app_id}/analyticsReportRequests"
        response = requests.get(endpoint, auth=self.auth)
        response.raise_for_status()

        return AnalyticsReportRequestsResponse.from_json(response.text)  # type: ignore

    def list_analytics_reports(
        self, req_id: str, report_name: str
    ) -> AnalyticsReportResponse:
        """Fetch the reports of request ``req_id`` filtered by name (single page only)."""
        endpoint = f"https://api.appstoreconnect.apple.com/v1/analyticsReportRequests/{req_id}/reports"
        response = requests.get(
            endpoint,
            auth=self.auth,
            params={"filter[name]": report_name},
        )
        response.raise_for_status()
        return AnalyticsReportResponse.from_json(response.text)  # type: ignore

    def list_report_instances(
        self,
        report_id: str,
        granularity: str = "DAILY",
    ) -> AnalyticsReportInstancesResponse:
        """Fetch all instances of ``report_id``, following ``links.next`` page by page.

        The returned response holds the instances of every page together with
        the ``links`` and ``meta`` of the last page fetched.
        """
        collected = []
        next_url = f"https://api.appstoreconnect.apple.com/v1/analyticsReports/{report_id}/instances"
        query: Optional[dict] = {"filter[granularity]": granularity}

        while next_url:
            response = requests.get(next_url, auth=self.auth, params=query)
            response.raise_for_status()

            page = AnalyticsReportInstancesResponse.from_json(response.text)  # type: ignore
            collected.extend(page.data)

            next_url = page.links.next
            # The filter is only sent with the first request; follow-up pages
            # are requested with the ``next`` URL as returned.
            query = None

        return AnalyticsReportInstancesResponse(
            data=collected,
            links=page.links,
            meta=page.meta,
        )

    def list_report_segments(self, instance_id: str) -> AnalyticsReportSegmentsResponse:
        """Fetch all segments of ``instance_id``, following ``links.next`` page by page.

        The returned response holds the segments of every page together with
        the ``links`` and ``meta`` of the last page fetched.
        """
        collected = []
        next_url = f"https://api.appstoreconnect.apple.com/v1/analyticsReportInstances/{instance_id}/segments"

        while next_url:
            response = requests.get(next_url, auth=self.auth)
            response.raise_for_status()

            page = AnalyticsReportSegmentsResponse.from_json(response.text)  # type: ignore
            collected.extend(page.data)

            next_url = page.links.next

        return AnalyticsReportSegmentsResponse(
            data=collected, links=page.links, meta=page.meta
        )

    def auth(self, req: PreparedRequest) -> PreparedRequest:
        """Set the ``Authorization`` header of ``req`` to a freshly signed JWT.

        The token expires 600 seconds after it is created. The method is
        passed to ``requests`` as the ``auth`` callable and returns the same
        request object it received.
        """
        token = jwt.encode(
            {
                "iss": self.__issuer_id,
                "exp": int(time.time()) + 600,
                "aud": "appstoreconnect-v1",
            },
            self.__key,
            algorithm="ES256",
            headers={
                "alg": "ES256",
                "kid": self.__key_id,
            },
        )
        # NOTE(review): the token is sent without a "Bearer " prefix --
        # confirm against the App Store Connect API documentation.
        req.headers["Authorization"] = token
        return req
@@ -0,0 +1,15 @@
1
class NoReportsFoundError(Exception):
    """Raised when no report instance falls inside the requested date range."""

    def __init__(self):
        message = "No Report instances found for the given date range"
        super().__init__(message)
4
+
5
+
6
class NoOngoingReportRequestsFoundError(Exception):
    """Raised when an app has no usable ONGOING analytics report request."""

    def __init__(self):
        message = (
            "No ONGOING report requests found (or they're stopped due to inactivity)"
        )
        super().__init__(message)
11
+
12
+
13
class NoSuchReportError(Exception):
    """Raised when the requested report name is not available."""

    def __init__(self, report_name):
        message = f"No such report found: {report_name}"
        super().__init__(message)
@@ -0,0 +1,117 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+ from dataclasses_json import dataclass_json
5
+
6
+
7
@dataclass_json
@dataclass
class Links:
    """Link section of an API response, deserialized via dataclasses_json."""

    # URL of the current document (presumably; name taken from the payload).
    self: str
    # URL of the next page; absent/None when there is no further page.
    next: Optional[str] = None
12
+
13
+
14
@dataclass_json
@dataclass
class ReportRequestAttributes:
    """Attributes of an analytics report request."""

    # Access type of the request; get_report only accepts the value "ONGOING".
    accessType: str
    # True when the request was stopped due to inactivity; such requests are skipped.
    stoppedDueToInactivity: bool
19
+
20
+
21
@dataclass_json
@dataclass
class ReportAttributes:
    """Attributes of an analytics report."""

    # Report name as returned by the API.
    name: str
    # Report category as returned by the API.
    category: str
26
+
27
+
28
@dataclass_json
@dataclass
class ReportInstanceAttributes:
    """Attributes of a single report instance."""

    # Granularity of the instance (the client filters on "DAILY" by default).
    granularity: str
    # ISO-format date string; parsed with datetime.fromisoformat when filtering.
    processingDate: str
33
+
34
+
35
@dataclass_json
@dataclass
class ReportSegmentAttributes:
    """Attributes of one downloadable segment of a report instance."""

    # Checksum string; get_report uses it as the temporary file's base name.
    checksum: str
    # URL from which the segment file is downloaded.
    url: str
    # Size of the segment in bytes, as reported by the API.
    sizeInBytes: int
41
+
42
+
43
@dataclass_json
@dataclass
class ReportRequest:
    """One analytics report request resource."""

    # Resource type string from the payload.
    type: str
    # Identifier passed to list_analytics_reports as the request id.
    id: str
    attributes: ReportRequestAttributes
49
+
50
+
51
@dataclass_json
@dataclass
class Report:
    """One analytics report resource."""

    # Resource type string from the payload.
    type: str
    # Identifier passed to list_report_instances as the report id.
    id: str
    attributes: ReportAttributes
57
+
58
+
59
@dataclass_json
@dataclass
class ReportInstance:
    """One report instance resource."""

    # Resource type string from the payload.
    type: str
    # Identifier passed to list_report_segments as the instance id.
    id: str
    attributes: ReportInstanceAttributes
65
+
66
+
67
@dataclass_json
@dataclass
class ReportSegment:
    """One report segment resource."""

    # Resource type string from the payload.
    type: str
    # Identifier of the segment.
    id: str
    attributes: ReportSegmentAttributes
73
+
74
+
75
@dataclass_json
@dataclass
class PagingMeta:
    """Paging counters of an API response."""

    # Presumably the total number of available items -- verify against the API docs.
    total: int
    # Presumably the page size used for the response -- verify against the API docs.
    limit: int
80
+
81
+
82
@dataclass_json
@dataclass
class Meta:
    """Metadata section of an API response."""

    paging: PagingMeta
86
+
87
+
88
@dataclass_json
@dataclass
class AnalyticsReportRequestsResponse:
    """Response body listing analytics report requests."""

    data: List[ReportRequest]
    meta: Meta
    links: Links
94
+
95
+
96
@dataclass_json
@dataclass
class AnalyticsReportResponse:
    """Response body listing analytics reports."""

    data: List[Report]
    meta: Meta
    links: Links
102
+
103
+
104
@dataclass_json
@dataclass
class AnalyticsReportInstancesResponse:
    """Response body listing report instances."""

    data: List[ReportInstance]
    meta: Meta
    links: Links
110
+
111
+
112
@dataclass_json
@dataclass
class AnalyticsReportSegmentsResponse:
    """Response body listing report segments."""

    data: List[ReportSegment]
    meta: Meta
    links: Links
@@ -0,0 +1,179 @@
1
+ from dataclasses import dataclass
2
+ from typing import List
3
+
4
+
5
@dataclass
class ResourceConfig:
    """Static configuration of one analytics report exposed as a resource."""

    # Name given to the resource.
    name: str
    # Column names that together form the resource's primary key.
    primary_key: List[str]
    # Column hints, mapping a column name to e.g. {"data_type": "date"}.
    columns: dict
    # Name of the report to request, e.g. "App Downloads Detailed".
    report_name: str
11
+
12
+
13
# Catalogue of the supported analytics reports. Each entry pairs a resource
# name with the report name to request, the columns forming its primary key
# and the data-type hints for the columns that are not plain text.
RESOURCES: List[ResourceConfig] = [
    ResourceConfig(
        name="app-downloads-detailed",
        primary_key=[
            "App Apple Identifier",
            "App Name",
            "App Version",
            "Campaign",
            "Date",
            "Device",
            "Download Type",
            "Page Title",
            "Page Type",
            "Platform Version",
            "Pre-Order",
            "Source Info",
            "Source Type",
            "Territory",
        ],
        columns={
            "Date": {"data_type": "date"},
            "App Apple Identifier": {"data_type": "bigint"},
            "Counts": {"data_type": "bigint"},
            "processing_date": {"data_type": "date"},
        },
        report_name="App Downloads Detailed",
    ),
    ResourceConfig(
        name="app-store-discovery-and-engagement-detailed",
        primary_key=[
            "App Apple Identifier",
            "App Name",
            "Campaign",
            "Date",
            "Device",
            "Engagement Type",
            "Event",
            "Page Title",
            "Page Type",
            "Platform Version",
            "Source Info",
            "Source Type",
            "Territory",
        ],
        columns={
            "Date": {"data_type": "date"},
            "App Apple Identifier": {"data_type": "bigint"},
            "Counts": {"data_type": "bigint"},
            "Unique Counts": {"data_type": "bigint"},
            "processing_date": {"data_type": "date"},
        },
        report_name="App Store Discovery and Engagement Detailed",
    ),
    ResourceConfig(
        name="app-sessions-detailed",
        primary_key=[
            "Date",
            "App Name",
            "App Apple Identifier",
            "App Version",
            "Device",
            "Platform Version",
            "Source Type",
            "Source Info",
            "Campaign",
            "Page Type",
            "Page Title",
            "App Download Date",
            "Territory",
        ],
        columns={
            "Date": {"data_type": "date"},
            "App Apple Identifier": {"data_type": "bigint"},
            "Sessions": {"data_type": "bigint"},
            "Total Session Duration": {"data_type": "bigint"},
            "Unique Devices": {"data_type": "bigint"},
            "processing_date": {"data_type": "date"},
        },
        report_name="App Sessions Detailed",
    ),
    ResourceConfig(
        name="app-store-installation-and-deletion-detailed",
        # NOTE(review): unlike the other entries, this key also lists
        # "Counts" and "Unique Devices", which are typed as bigint columns
        # below -- confirm that these belong in the primary key.
        primary_key=[
            "App Apple Identifier",
            "App Download Date",
            "App Name",
            "App Version",
            "Campaign",
            "Counts",
            "Date",
            "Device",
            "Download Type",
            "Event",
            "Page Title",
            "Page Type",
            "Platform Version",
            "Source Info",
            "Source Type",
            "Territory",
            "Unique Devices",
        ],
        columns={
            "Date": {"data_type": "date"},
            "App Apple Identifier": {"data_type": "bigint"},
            "Counts": {"data_type": "bigint"},
            "Unique Devices": {"data_type": "bigint"},
            "App Download Date": {"data_type": "date"},
            "processing_date": {"data_type": "date"},
        },
        report_name="App Store Installation and Deletion Detailed",
    ),
    ResourceConfig(
        name="app-store-purchases-detailed",
        primary_key=[
            "App Apple Identifier",
            "App Download Date",
            "App Name",
            "Campaign",
            "Content Apple Identifier",
            "Content Name",
            "Date",
            "Device",
            "Page Title",
            "Page Type",
            "Payment Method",
            "Platform Version",
            "Pre-Order",
            "Purchase Type",
            "Source Info",
            "Source Type",
            "Territory",
        ],
        columns={
            "Date": {"data_type": "date"},
            "App Apple Identifier": {"data_type": "bigint"},
            "App Download Date": {"data_type": "date"},
            "Content Apple Identifier": {"data_type": "bigint"},
            "Purchases": {"data_type": "bigint"},
            "Proceeds In USD": {"data_type": "double"},
            "Sales In USD": {"data_type": "double"},
            "Paying Users": {"data_type": "bigint"},
            "processing_date": {"data_type": "date"},
        },
        report_name="App Store Purchases Detailed",
    ),
    ResourceConfig(
        name="app-crashes-expanded",
        # NOTE(review): "App Apple Identifier" is typed below but is not part
        # of this key, unlike in the other entries -- confirm this is intended.
        primary_key=[
            "App Name",
            "App Version",
            "Build",
            "Date",
            "Device",
            "Platform",
            "Release Type",
            "Territory",
        ],
        columns={
            "Date": {"data_type": "date"},
            "processing_date": {"data_type": "date"},
            "App Apple Identifier": {"data_type": "bigint"},
            "Count": {"data_type": "bigint"},
            "Unique Devices": {"data_type": "bigint"},
        },
        report_name="App Crashes Expanded",
    ),
]
@@ -150,7 +150,10 @@ def tasks(
150
150
  project_array: t.List[TDataItem],
151
151
  access_token: str = dlt.secrets.value,
152
152
  modified_at: dlt.sources.incremental[str] = dlt.sources.incremental(
153
- "modified_at", initial_value=DEFAULT_START_DATE
153
+ "modified_at",
154
+ initial_value=DEFAULT_START_DATE,
155
+ range_end="closed",
156
+ range_start="closed",
154
157
  ),
155
158
  fields: Iterable[str] = TASK_FIELDS,
156
159
  ) -> Iterable[TDataItem]:
ingestr/src/errors.py ADDED
@@ -0,0 +1,10 @@
1
class MissingValueError(Exception):
    """Raised when a value required to connect to a source is missing."""

    def __init__(self, value, source):
        message = f"{value} is required to connect to {source}"
        super().__init__(message)
4
+
5
+
6
class UnsupportedResourceError(Exception):
    """Raised when a requested resource is not supported for a source."""

    def __init__(self, resource, source):
        message = f"Resource '{resource}' is not supported for {source} source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
        super().__init__(message)
@@ -159,7 +159,10 @@ def facebook_insights_source(
159
159
  )
160
160
  def facebook_insights(
161
161
  date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
162
- "date_start", initial_value=initial_load_start_date_str
162
+ "date_start",
163
+ initial_value=initial_load_start_date_str,
164
+ range_end="closed",
165
+ range_start="closed",
163
166
  ),
164
167
  ) -> Iterator[TDataItems]:
165
168
  start_date = get_start_date(date_start, attribution_window_days_lag)
ingestr/src/factory.py CHANGED
@@ -18,6 +18,7 @@ from ingestr.src.destinations import (
18
18
  from ingestr.src.sources import (
19
19
  AdjustSource,
20
20
  AirtableSource,
21
+ AppleAppStoreSource,
21
22
  AppsflyerSource,
22
23
  ArrowMemoryMappedSource,
23
24
  AsanaSource,
@@ -122,6 +123,7 @@ class SourceDestinationFactory:
122
123
  "asana": AsanaSource,
123
124
  "tiktok": TikTokSource,
124
125
  "googleanalytics": GoogleAnalyticsSource,
126
+ "appstore": AppleAppStoreSource,
125
127
  }
126
128
  destinations: Dict[str, Type[DestinationProtocol]] = {
127
129
  "bigquery": BigQueryDestination,
@@ -38,7 +38,9 @@ def readers(
38
38
  """
39
39
  filesystem_resource = filesystem(bucket_url, credentials, file_glob=file_glob)
40
40
  filesystem_resource.apply_hints(
41
- incremental=dlt.sources.incremental("modification_date")
41
+ incremental=dlt.sources.incremental("modification_date"),
42
+ range_end="closed",
43
+ range_start="closed",
42
44
  )
43
45
  return (
44
46
  filesystem_resource | dlt.transformer(name="read_csv")(_read_csv),
@@ -14,7 +14,7 @@ from .helpers import get_reactions_data, get_rest_pages, get_stargazers
14
14
  def github_reactions(
15
15
  owner: str,
16
16
  name: str,
17
- access_token: str = dlt.secrets.value,
17
+ access_token: str,
18
18
  items_per_page: int = 100,
19
19
  max_items: Optional[int] = None,
20
20
  ) -> Sequence[DltResource]:
@@ -89,7 +89,11 @@ def github_repo_events(
89
89
  @dlt.resource(primary_key="id", table_name=lambda i: i["type"])
90
90
  def repo_events(
91
91
  last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
92
- "created_at", initial_value="1970-01-01T00:00:00Z", last_value_func=max
92
+ "created_at",
93
+ initial_value="1970-01-01T00:00:00Z",
94
+ last_value_func=max,
95
+ range_end="closed",
96
+ range_start="closed",
93
97
  ),
94
98
  ) -> Iterator[TDataItems]:
95
99
  repos_path = (
@@ -114,7 +118,7 @@ def github_repo_events(
114
118
  def github_stargazers(
115
119
  owner: str,
116
120
  name: str,
117
- access_token: str = dlt.secrets.value,
121
+ access_token: str,
118
122
  items_per_page: int = 100,
119
123
  max_items: Optional[int] = None,
120
124
  ) -> Sequence[DltResource]: