ingestr 0.12.5__py3-none-any.whl → 0.12.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -2,26 +2,32 @@
  Defines all the sources and resources needed for Google Analytics V4
  """

- from typing import List, Optional, Union
+ from typing import Iterator, List, Optional, Union

  import dlt
- from dlt.common.typing import DictStrAny
- from dlt.sources import DltResource
+ from dlt.common import pendulum
+ from dlt.common.typing import DictStrAny, TDataItem
+ from dlt.extract import DltResource
  from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
  from google.analytics.data_v1beta import BetaAnalyticsDataClient
+ from google.analytics.data_v1beta.types import (
+     Dimension,
+     Metric,
+ )

- from .helpers import basic_report
+ from .helpers import get_report


  @dlt.source(max_table_nesting=0)
  def google_analytics(
-     datetime: str,
+     datetime_dimension: str,
      credentials: Union[
          GcpOAuthCredentials, GcpServiceAccountCredentials
      ] = dlt.secrets.value,
      property_id: int = dlt.config.value,
      queries: List[DictStrAny] = dlt.config.value,
-     start_date: Optional[str] = "2015-08-14",
+     start_date: Optional[pendulum.DateTime] = pendulum.datetime(2024, 1, 1),
+     end_date: Optional[pendulum.DateTime] = None,
      rows_per_page: int = 10000,
  ) -> List[DltResource]:
      try:
@@ -50,21 +56,51 @@ def google_analytics(

          # always add "date" to dimensions so we are able to track the last day of a report
          dimensions = query["dimensions"]
-         resource_name = query["resource_name"]

-         res = dlt.resource(
-             basic_report, name="basic_report", merge_key=datetime, write_disposition="merge"
-         )(
-             client=client,
-             rows_per_page=rows_per_page,
-             property_id=property_id,
-             dimensions=dimensions,
-             metrics=query["metrics"],
-             resource_name=resource_name,
-             start_date=start_date,
-             last_date=dlt.sources.incremental(
-                 datetime
-             ),  # pass empty primary key to avoid unique checks, a primary key defined by the resource will be used
+         @dlt.resource(
+             name="basic_report",
+             merge_key=datetime_dimension,
+             write_disposition="merge",
          )
+         def basic_report(
+             incremental=dlt.sources.incremental(
+                 datetime_dimension,
+                 initial_value=start_date,
+                 end_value=end_date,
+                 range_end="closed",
+                 range_start="closed",
+             ),
+         ) -> Iterator[TDataItem]:
+             start_date = incremental.last_value
+             end_date = incremental.end_value
+             if start_date is None:
+                 start_date = pendulum.datetime(2024, 1, 1)
+             if end_date is None:
+                 end_date = pendulum.yesterday()
+             yield from get_report(
+                 client=client,
+                 property_id=property_id,
+                 dimension_list=[Dimension(name=dimension) for dimension in dimensions],
+                 metric_list=[Metric(name=metric) for metric in query["metrics"]],
+                 per_page=rows_per_page,
+                 start_date=start_date,
+                 end_date=end_date,
+             )
+
+         # res = dlt.resource(
+         #     basic_report, name="basic_report", merge_key=datetime_dimension, write_disposition="merge"
+         # )(
+         #     client=client,
+         #     rows_per_page=rows_per_page,
+         #     property_id=property_id,
+         #     dimensions=dimensions,
+         #     metrics=query["metrics"],
+         #     resource_name=resource_name,
+         #     last_date=dlt.sources.incremental(
+         #         datetime_dimension,
+         #         initial_value=start_date,
+         #         end_value=end_date,
+         #     ),
+         # )

-     return [res]
+     return [basic_report]
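For context, the reworked source can be wired into a dlt pipeline roughly as follows. This is an illustrative sketch, not code from the release: the pipeline name, destination, property ID and query definition are placeholders, the import path is assumed, and GCP credentials are expected to come from dlt secrets/config as in the signature above.

import dlt
import pendulum

from ingestr.src.google_analytics import google_analytics  # assumed import path

pipeline = dlt.pipeline(
    pipeline_name="ga4_basic_report",  # placeholder
    destination="duckdb",              # placeholder
    dataset_name="google_analytics",
)

source = google_analytics(
    datetime_dimension="date",
    property_id=123456,                # placeholder GA4 property
    queries=[
        {
            "resource_name": "custom",
            "dimensions": ["date", "country"],
            "metrics": ["activeUsers"],
        }
    ],
    start_date=pendulum.datetime(2024, 1, 1),
    end_date=pendulum.yesterday(),
    # credentials default to dlt.secrets.value, so they must be configured via dlt secrets
)

pipeline.run(source.with_resources("basic_report"))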
@@ -57,9 +57,9 @@ def get_report(
      property_id: int,
      dimension_list: List[Dimension],
      metric_list: List[Metric],
-     limit: int,
-     start_date: str,
-     end_date: str,
+     per_page: int,
+     start_date: pendulum.DateTime,
+     end_date: pendulum.DateTime,
  ) -> Iterator[TDataItem]:
      """
      Gets all the possible pages of reports with the given query parameters.
@@ -79,30 +79,36 @@ def get_report(
          Generator of all rows of data in the report.
      """

-     request = RunReportRequest(
-         property=f"properties/{property_id}",
-         dimensions=dimension_list,
-         metrics=metric_list,
-         limit=limit,
-         date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
+     print(
+         "fetching for daterange", start_date.to_date_string(), end_date.to_date_string()
      )
-     # process request
-     response = client.run_report(request)
-     processed_response_generator = process_report(response=response)
-     yield from processed_response_generator

+     offset = 0
+     while True:
+         request = RunReportRequest(
+             property=f"properties/{property_id}",
+             dimensions=dimension_list,
+             metrics=metric_list,
+             limit=per_page,
+             offset=offset,
+             date_ranges=[
+                 DateRange(
+                     start_date=start_date.to_date_string(),
+                     end_date=end_date.to_date_string(),
+                 )
+             ],
+         )
+         # process request
+         response = client.run_report(request)
+         processed_response_generator = process_report(response=response)
+         # import pdb; pdb.set_trace()
+         yield from processed_response_generator
+         offset += per_page
+         if len(response.rows) < per_page or offset > 1000000:
+             break

- def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
-     """
-     Receives a single page for a report response, processes it, and returns a generator for every row of data in the report page.
-
-     Args:
-         response: The API response for a single page of the report.
-
-     Yields:
-         Generator of dictionaries for every row of the report page.
-     """

+ def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
      metrics_headers = [header.name for header in response.metric_headers]
      dimensions_headers = [header.name for header in response.dimension_headers]

@@ -156,16 +162,6 @@ def process_metric_value(metric_type: MetricType, value: str) -> Union[str, int,


  def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
-     """
-     Helper function that receives a dimension's name and value and converts it to a datetime object if needed.
-
-     Args:
-         dimension_name: Name of the dimension.
-         dimension_value: Value of the dimension.
-
-     Returns:
-         The value of the dimension with the correct data type.
-     """
      if dimension_name == "date":
          return pendulum.from_format(dimension_value, "YYYYMMDD", tz="UTC")
      elif dimension_name == "dateHour":
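The core change above is that get_report now pages through the GA4 Data API with an explicit offset instead of issuing a single request. A stripped-down sketch of that loop, with a hypothetical fetch_page callable standing in for client.run_report, makes the termination condition easier to see:

from typing import Callable, Iterator, List

def paginate(fetch_page: Callable[[int, int], List[dict]], per_page: int) -> Iterator[dict]:
    # fetch_page(offset, limit) stands in for one RunReportRequest / client.run_report call
    offset = 0
    while True:
        rows = fetch_page(offset, per_page)
        yield from rows
        offset += per_page
        # stop on a short (final) page, or once the 1,000,000-row offset cap used above is reached
        if len(rows) < per_page or offset > 1_000_000:
            break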
@@ -116,7 +116,9 @@ def gorgias_source(
          },
      )
      def customers(
-         updated_datetime=dlt.sources.incremental("updated_datetime", start_date_obj),
+         updated_datetime=dlt.sources.incremental(
+             "updated_datetime", start_date_obj, range_end="closed", range_start="closed"
+         ),
      ) -> Iterable[TDataItem]:
          """
          The resource for customers on your Gorgias domain, supports incremental loading and pagination.
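The same two keyword arguments, range_start="closed" and range_end="closed", are added to nearly every dlt.sources.incremental call in this release (Gorgias, HubSpot, Klaviyo, Shopify, Slack and the generic sources further below). As a rough sketch of what that means for a resource, assuming dlt's documented behaviour that a closed bound includes rows whose cursor equals the boundary value (names below are illustrative, not from the package):

import dlt

@dlt.resource(write_disposition="merge", primary_key="id")
def rows(
    updated_at=dlt.sources.incremental(
        "updated_at",
        initial_value="2024-01-01T00:00:00Z",
        range_start="closed",  # rows whose cursor equals last_value are included
        range_end="closed",    # when an end_value is set, rows equal to it are included too
    ),
):
    # fetch_rows is a hypothetical helper; the window is bounded by the incremental object
    yield from fetch_rows(since=updated_at.last_value, until=updated_at.end_value)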
@@ -290,7 +292,9 @@ def gorgias_source(
          },
      )
      def tickets(
-         updated_datetime=dlt.sources.incremental("updated_datetime", start_date_obj),
+         updated_datetime=dlt.sources.incremental(
+             "updated_datetime", start_date_obj, range_end="closed", range_start="closed"
+         ),
      ) -> Iterable[TDataItem]:
          """
          The resource for tickets on your Gorgias domain, supports incremental loading and pagination.
@@ -481,7 +485,9 @@ def gorgias_source(
          },
      )
      def ticket_messages(
-         updated_datetime=dlt.sources.incremental("updated_datetime", start_date_obj),
+         updated_datetime=dlt.sources.incremental(
+             "updated_datetime", start_date_obj, range_end="closed", range_start="closed"
+         ),
      ) -> Iterable[TDataItem]:
          """
          The resource for ticket messages on your Gorgias domain, supports incremental loading and pagination.
@@ -566,7 +572,9 @@ def gorgias_source(
          },
      )
      def satisfaction_surveys(
-         updated_datetime=dlt.sources.incremental("updated_datetime", start_date_obj),
+         updated_datetime=dlt.sources.incremental(
+             "updated_datetime", start_date_obj, range_end="closed", range_start="closed"
+         ),
      ) -> Iterable[TDataItem]:
          """
          The resource for satisfaction surveys on your Gorgias domain, supports incremental loading and pagination.
@@ -278,4 +278,11 @@ def hubspot_events_for_objects(
          write_disposition="append",
          selected=True,
          table_name=lambda e: name + "_" + str(e["eventType"]),
-     )(dlt.sources.incremental("occurredAt", initial_value=start_date.isoformat()))
+     )(
+         dlt.sources.incremental(
+             "occurredAt",
+             initial_value=start_date.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         )
+     )
@@ -33,7 +33,12 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
      def events(
-         datetime=dlt.sources.incremental("datetime", start_date_obj.isoformat()),
+         datetime=dlt.sources.incremental(
+             "datetime",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(datetime.start_value), pendulum.now()
@@ -44,7 +49,12 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
      def profiles(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(updated.start_value), pendulum.now()
@@ -55,7 +65,12 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
      def campaigns(
-         updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
+         updated_at=dlt.sources.incremental(
+             "updated_at",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(updated_at.start_value), pendulum.now()
@@ -69,7 +84,12 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="merge", primary_key="id")
      def metrics(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          yield from client.fetch_metrics(create_client(), updated.start_value)

@@ -83,7 +103,12 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="merge", primary_key="id", name="catalog-variants")
      def catalog_variants(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          yield from client.fetch_catalog_variant(create_client(), updated.start_value)

@@ -91,19 +116,34 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour
          write_disposition="merge", primary_key="id", name="catalog-categories"
      )
      def catalog_categories(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          yield from client.fetch_catalog_categories(create_client(), updated.start_value)

      @dlt.resource(write_disposition="merge", primary_key="id", name="catalog-items")
      def catalog_items(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          yield from client.fetch_catalog_item(create_client(), updated.start_value)

      @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
      def forms(
-         updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
+         updated_at=dlt.sources.incremental(
+             "updated_at",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(updated_at.start_value), pendulum.now()
@@ -114,13 +154,23 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="merge", primary_key="id")
      def lists(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          yield from client.fetch_lists(create_client(), updated.start_value)

      @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
      def images(
-         updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
+         updated_at=dlt.sources.incremental(
+             "updated_at",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(updated_at.start_value), pendulum.now()
@@ -130,13 +180,23 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="merge", primary_key="id")
      def segments(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          yield from client.fetch_segments(create_client(), updated.start_value)

      @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
      def flows(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(updated.start_value), pendulum.now()
@@ -146,7 +206,12 @@ def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResour

      @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
      def templates(
-         updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+         updated=dlt.sources.incremental(
+             "updated",
+             start_date_obj.isoformat(),
+             range_end="closed",
+             range_start="closed",
+         ),
      ) -> Iterable[TDataItem]:
          intervals = split_date_range(
              pendulum.parse(updated.start_value), pendulum.now()
@@ -158,6 +158,8 @@ def shopify_source(
              initial_value=start_date_obj,
              end_value=end_date_obj,
              allow_external_schedulers=True,
+             range_end="closed",
+             range_start="closed",
          ),
          created_at_min: pendulum.DateTime = created_at_min_obj,
          items_per_page: int = items_per_page,
@@ -606,6 +608,8 @@ def shopify_source(
              initial_value=start_date_obj,
              end_value=end_date_obj,
              allow_external_schedulers=True,
+             range_end="closed",
+             range_start="closed",
          ),
          created_at_min: pendulum.DateTime = created_at_min_obj,
          items_per_page: int = items_per_page,
@@ -640,6 +644,8 @@ def shopify_source(
              initial_value=start_date_obj,
              end_value=end_date_obj,
              allow_external_schedulers=True,
+             range_end="closed",
+             range_start="closed",
          ),
          created_at_min: pendulum.DateTime = created_at_min_obj,
          items_per_page: int = items_per_page,
@@ -671,6 +677,8 @@ def shopify_source(
              "created_at",
              initial_value=start_date_obj,
              end_value=end_date_obj,
+             range_end="closed",
+             range_start="closed",
          ),
          items_per_page: int = items_per_page,
      ) -> Iterable[TDataItem]:
@@ -689,6 +697,8 @@ def shopify_source(
              "updated_at",
              initial_value=start_date_obj,
              end_value=end_date_obj,
+             range_end="closed",
+             range_start="closed",
          ),
          items_per_page: int = items_per_page,
      ) -> Iterable[TDataItem]:
@@ -730,6 +740,8 @@ def shopify_source(
              initial_value=start_date_obj,
              end_value=end_date_obj,
              allow_external_schedulers=True,
+             range_end="closed",
+             range_start="closed",
          ),
          items_per_page: int = items_per_page,
      ) -> Iterable[TDataItem]:
@@ -1807,6 +1819,8 @@ query discountNodes($after: String, $query: String, $first: Int) {
              "updatedAt",
              initial_value=start_date_obj,
              end_value=end_date_obj,
+             range_end="closed",
+             range_start="closed",
          ),
          items_per_page: int = items_per_page,
      ) -> Iterable[TDataItem]:
@@ -175,6 +175,8 @@ def slack_source(
              initial_value=start_dt,
              end_value=end_dt,
              allow_external_schedulers=True,
+             range_end="closed",
+             range_start="closed",
          ),
      ) -> Iterable[TDataItem]:
          """
@@ -198,6 +200,8 @@ def slack_source(
              initial_value=start_dt,
              end_value=end_dt,
              allow_external_schedulers=True,
+             range_end="closed",
+             range_start="closed",
          ),
      ) -> Iterable[TDataItem]:
          """Yield all messages for a given channel as a DLT resource. Keep blocks column without normalization.
ingestr/src/sources.py CHANGED
@@ -3,7 +3,7 @@ import csv
  import json
  import os
  import re
- from datetime import date, datetime
+ from datetime import date, datetime, timedelta
  from typing import (
      Any,
      Callable,
@@ -46,10 +46,16 @@ from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
  from ingestr.src.adjust.adjust_helpers import parse_filters
  from ingestr.src.airtable import airtable_source
  from ingestr.src.appsflyer._init_ import appsflyer_source
+ from ingestr.src.appstore import app_store
+ from ingestr.src.appstore.client import AppStoreConnectClient
  from ingestr.src.arrow import memory_mapped_arrow
  from ingestr.src.asana_source import asana_source
  from ingestr.src.chess import source
  from ingestr.src.dynamodb import dynamodb
+ from ingestr.src.errors import (
+     MissingValueError,
+     UnsupportedResourceError,
+ )
  from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
  from ingestr.src.filesystem import readers
  from ingestr.src.filters import table_adapter_exclude_columns
@@ -240,6 +246,8 @@ class ArrowMemoryMappedSource:
              kwargs.get("incremental_key", ""),
              initial_value=start_value,
              end_value=end_value,
+             range_end="closed",
+             range_start="closed",
          )

          file_path = uri.split("://")[1]
@@ -285,6 +293,8 @@ class MongoDbSource:
              kwargs.get("incremental_key", ""),
              initial_value=start_value,
              end_value=end_value,
+             range_end="closed",
+             range_start="closed",
          )

          table_instance = self.table_builder(
@@ -353,6 +363,8 @@ class LocalCsvSource:
                  kwargs.get("incremental_key", ""),
                  initial_value=kwargs.get("interval_start"),
                  end_value=kwargs.get("interval_end"),
+                 range_end="closed",
+                 range_start="closed",
              )
          )

@@ -1311,6 +1323,8 @@ class DynamoDBSource:
              incremental_key.strip(),
              initial_value=isotime(kwargs.get("interval_start")),
              end_value=isotime(kwargs.get("interval_end")),
+             range_end="closed",
+             range_start="closed",
          )

          return dynamodb(table, creds, incremental)
@@ -1336,11 +1350,6 @@ class GoogleAnalyticsSource:
          if not property_id:
              raise ValueError("property_id is required to connect to Google Analytics")

-         interval_start = kwargs.get("interval_start")
-         start_date = (
-             interval_start.strftime("%Y-%m-%d") if interval_start else "2015-08-14"
-         )
-
          fields = table.split(":")
          if len(fields) != 3:
@@ -1364,10 +1373,19 @@
              {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
          ]

+         start_date = pendulum.now().subtract(days=30).start_of("day")
+         if kwargs.get("interval_start") is not None:
+             start_date = pendulum.instance(kwargs.get("interval_start"))  # type: ignore
+
+         end_date = pendulum.now()
+         if kwargs.get("interval_end") is not None:
+             end_date = pendulum.instance(kwargs.get("interval_end"))  # type: ignore
+
          return google_analytics(
              property_id=property_id[0],
              start_date=start_date,
-             datetime=datetime,
+             end_date=end_date,
+             datetime_dimension=datetime,
              queries=queries,
              credentials=credentials,
          ).with_resources("basic_report")
@@ -1398,9 +1416,7 @@ class GitHubSource:
                  "repo variable is required to retrieve data for a specific repository from GitHub."
              )

-         access_token = source_fields.get("access_token", [None])[0]
-         if not access_token and table not in ["repo_events"]:
-             raise ValueError("access_token is required to connect with GitHub")
+         access_token = source_fields.get("access_token", [""])[0]

          if table in ["issues", "pull_requests"]:
              return github_reactions(
@@ -1414,3 +1430,76 @@
              raise ValueError(
                  f"Resource '{table}' is not supported for GitHub source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
              )
+
+
+ class AppleAppStoreSource:
+     def handles_incrementality(self) -> bool:
+         return True
+
+     def init_client(
+         self,
+         key_id: str,
+         issuer_id: str,
+         key_path: Optional[List[str]],
+         key_base64: Optional[List[str]],
+     ):
+         key = None
+         if key_path is not None:
+             with open(key_path[0]) as f:
+                 key = f.read()
+         else:
+             key = base64.b64decode(key_base64[0]).decode()  # type: ignore
+
+         return AppStoreConnectClient(key.encode(), key_id, issuer_id)
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         if kwargs.get("incremental_key"):
+             raise ValueError(
+                 "App Store takes care of incrementality on its own, you should not provide incremental_key"
+             )
+         parsed_uri = urlparse(uri)
+         params = parse_qs(parsed_uri.query)
+
+         key_id = params.get("key_id")
+         if key_id is None:
+             raise MissingValueError("key_id", "App Store")
+
+         key_path = params.get("key_path")
+         key_base64 = params.get("key_base64")
+         key_available = any(
+             map(
+                 lambda x: x is not None,
+                 [key_path, key_base64],
+             )
+         )
+         if key_available is False:
+             raise MissingValueError("key_path or key_base64", "App Store")
+
+         issuer_id = params.get("issuer_id")
+         if issuer_id is None:
+             raise MissingValueError("issuer_id", "App Store")
+
+         client = self.init_client(key_id[0], issuer_id[0], key_path, key_base64)
+
+         app_ids = params.get("app_id")
+         if ":" in table:
+             intended_table, app_ids_override = table.split(":", maxsplit=1)
+             app_ids = app_ids_override.split(",")
+             table = intended_table
+
+         if app_ids is None:
+             raise MissingValueError("app_id", "App Store")
+
+         src = app_store(
+             client,
+             app_ids,
+             start_date=kwargs.get(
+                 "interval_start", datetime.now() - timedelta(days=30)
+             ),
+             end_date=kwargs.get("interval_end"),
+         )
+
+         if table not in src.resources:
+             raise UnsupportedResourceError(table, "AppStore")
+
+         return src.with_resources(table)
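Since the new App Store source accepts its private key either as a file path (key_path) or inline (key_base64, decoded above with base64.b64decode), one plausible way to produce the encoded value is sketched below; the key file name is a placeholder.

import base64

# Read an App Store Connect API key file and emit a value usable as the key_base64 parameter.
with open("AuthKey_XXXXXXXXXX.p8", "rb") as f:
    encoded = base64.b64encode(f.read()).decode()

print(encoded)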