ingestr 0.13.34__py3-none-any.whl → 0.13.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


ingestr/src/airtable/__init__.py CHANGED
@@ -9,7 +9,7 @@ import pyairtable
 from dlt.sources import DltResource
 
 
-@dlt.source
+@dlt.source(max_table_nesting=1)
 def airtable_source(
     base_id: str = dlt.config.value,
     table_names: Optional[List[str]] = dlt.config.value,
@@ -50,12 +50,13 @@ def airtable_resource(
             It starts with "app". See https://support.airtable.com/docs/finding-airtable-ids
         table (Dict[str, Any]): Metadata about an airtable, does not contain the actual records
     """
+
     primary_key_id = table["primaryFieldId"]
     primary_key_field = [
         field for field in table["fields"] if field["id"] == primary_key_id
     ][0]
     table_name: str = table["name"]
-    primary_key: List[str] = [f"fields__{primary_key_field['name']}"]
+    primary_key: List[str] = [f"fields__{primary_key_field['name']}".lower()]
     air_table = api.table(base_id, table["id"])
 
     # Table.iterate() supports rich customization options, such as chunk size, fields, cell format, timezone, locale, and view
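For context, a minimal sketch of the lowercased merge key; the metadata dict below is hypothetical, standing in for what pyairtable returns:

```python
# Hypothetical Airtable table metadata; the real dict comes from pyairtable.
table = {
    "name": "Orders",
    "primaryFieldId": "fldPrimary",
    "fields": [{"id": "fldPrimary", "name": "Order Number"}],
}

primary_key_id = table["primaryFieldId"]
primary_key_field = [f for f in table["fields"] if f["id"] == primary_key_id][0]

# 0.13.34 produced "fields__Order Number"; 0.13.36 lowercases the key,
# presumably so it lines up with the lowercased column names dlt emits.
primary_key = [f"fields__{primary_key_field['name']}".lower()]
print(primary_key)  # ['fields__order number']
```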
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
-version = "v0.13.34"
+version = "v0.13.36"
ingestr/src/destinations.py CHANGED
@@ -235,12 +235,19 @@ class AthenaDestination:
         if not bucket.startswith("s3://"):
             bucket = f"s3://{bucket}"
 
-        query_result_path = source_params.get("query_results_path", [None])[0]
-        if query_result_path:
-            if not query_result_path.startswith("s3://"):
-                query_result_path = f"s3://{query_result_path}"
-        else:
-            query_result_path = bucket
+        bucket = bucket.rstrip("/")
+
+        dest_table = kwargs.get("dest_table", None)
+        if not dest_table:
+            raise ValueError("A destination table is required to connect to Athena.")
+
+        dest_table_fields = dest_table.split(".")
+        if len(dest_table_fields) != 2:
+            raise ValueError(
+                f"Table name must be in the format <schema>.<table>, given: {dest_table}"
+            )
+
+        query_result_path = f"{bucket}/{dest_table_fields[0]}_staging/metadata"
 
         access_key_id = source_params.get("access_key_id", [None])[0]
         secret_access_key = source_params.get("secret_access_key", [None])[0]
@@ -285,6 +292,7 @@ class AthenaDestination:
                 region_name=region_name,
             ),
             destination_name=bucket,
+            force_iceberg=True,
         )
 
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
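The query-results location is no longer taken from a `query_results_path` parameter; it is now derived from the bucket and the schema part of the destination table. A quick sketch with made-up values:

```python
# Made-up inputs; in ingestr they come from the destination URI and the
# destination table passed via kwargs["dest_table"].
bucket = "s3://my-athena-bucket/"
dest_table = "analytics.events"  # must now be in <schema>.<table> form

bucket = bucket.rstrip("/")
schema, _table = dest_table.split(".")
query_result_path = f"{bucket}/{schema}_staging/metadata"
print(query_result_path)  # s3://my-athena-bucket/analytics_staging/metadata
```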
ingestr/src/factory.py CHANGED
@@ -53,6 +53,7 @@ from ingestr.src.sources import (
     StripeAnalyticsSource,
     TikTokSource,
     ZendeskSource,
+    FreshdeskSource,
 )
 
 SQL_SOURCE_SCHEMES = [
@@ -148,6 +149,7 @@ class SourceDestinationFactory:
         "kinesis": KinesisSource,
         "pipedrive": PipedriveSource,
         "frankfurter": FrankfurterSource,
+        "freshdesk": FreshdeskSource,
     }
     destinations: Dict[str, Type[DestinationProtocol]] = {
         "bigquery": BigQueryDestination,
ingestr/src/frankfurter/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Iterator, Optional
+from typing import Any, Iterator
 
 import dlt
 from dlt.common.pendulum import pendulum
@@ -13,25 +13,28 @@ from ingestr.src.frankfurter.helpers import get_path_with_retry
     max_table_nesting=0,
 )
 def frankfurter_source(
-    table: str,
-    start_date: Optional[TAnyDateTime] = None,
-    end_date: Optional[TAnyDateTime] = None,
+    start_date: TAnyDateTime,
+    end_date: TAnyDateTime,
 ) -> Any:
     """
     A dlt source for the frankfurter.dev API. It groups several resources (in this case frankfurter.dev API endpoints) containing
     various types of data: currencies, latest rates, historical rates.
-
-    Returns the appropriate resource based on the provided parameters.
     """
-    # Determine which resource to return based on the `table` parameter
-    if table == "currencies":
-        return currencies()
+    date_time = dlt.sources.incremental(
+
+        "date",
+        initial_value=start_date,
+        end_value=end_date,
+        range_start="closed",
+        range_end="closed",
+    )
 
-    elif table == "latest":
-        return latest()
+    return (
+        currencies(),
+        latest(),
+        exchange_rates(start_date=date_time, end_date=end_date),
 
-    elif table == "exchange_rates":
-        return exchange_rates(start_date=start_date, end_date=end_date)
+    )
 
 
 @dlt.resource(
@@ -53,13 +56,13 @@ def currencies() -> Iterator[dict]:
 
 
 @dlt.resource(
-    write_disposition="replace",
+    write_disposition="merge",
     columns={
         "date": {"data_type": "text"},
-        "currency_name": {"data_type": "text"},
+        "currency_code": {"data_type": "text"},
         "rate": {"data_type": "double"},
     },
-    primary_key=["date", "currency_name"],  # Composite primary key
+    primary_key=["date", "currency_code"],  # Composite primary key
 )
 def latest() -> Iterator[dict]:
     """
@@ -69,50 +72,54 @@ def latest() -> Iterator[dict]:
     url = "latest?"
 
     # Fetch data
-    latest_data = get_path_with_retry(url)
+    data = get_path_with_retry(url)
 
     # Extract rates and base currency
-    rates = latest_data["rates"]
+    rates = data["rates"]
 
-    # Prepare the date
-    date = pendulum.now().to_date_string()
+    date = pendulum.parse(data["date"])
 
     # Add the base currency (EUR) with a rate of 1.0
     yield {
         "date": date,
-        "currency_name": "EUR",
+        "currency_code": "EUR",
         "rate": 1.0,
     }
 
     # Add all currencies and their rates
-    for currency_name, rate in rates.items():
+    for currency_code, rate in rates.items():
         yield {
             "date": date,
-            "currency_name": currency_name,
+            "currency_code": currency_code,
             "rate": rate,
         }
 
 
 @dlt.resource(
-    write_disposition="replace",
+    write_disposition="merge",
     columns={
         "date": {"data_type": "text"},
-        "currency_name": {"data_type": "text"},
+        "currency_code": {"data_type": "text"},
         "rate": {"data_type": "double"},
     },
-    primary_key=["date", "currency_name"],  # Composite primary key
+    primary_key=("date", "currency_code"),  # Composite primary key
 )
 def exchange_rates(
-    start_date: TAnyDateTime,
     end_date: TAnyDateTime,
+    start_date: dlt.sources.incremental[TAnyDateTime] = dlt.sources.incremental("date"),
 ) -> Iterator[dict]:
     """
     Fetches exchange rates for a specified date range.
-    If only start_date is provided, fetches data for that date.
+    If only start_date is provided, fetches data until now.
     If both start_date and end_date are provided, fetches data for each day in the range.
     """
-    start_date_str = ensure_pendulum_datetime(start_date).format("YYYY-MM-DD")
-    end_date_str = ensure_pendulum_datetime(end_date).format("YYYY-MM-DD")
+    # Ensure start_date.last_value is a pendulum.DateTime object
+    start_date_obj = ensure_pendulum_datetime(start_date.last_value)  # type: ignore
+    start_date_str = start_date_obj.format("YYYY-MM-DD")
+
+    # Ensure end_date is a pendulum.DateTime object
+    end_date_obj = ensure_pendulum_datetime(end_date)
+    end_date_str = end_date_obj.format("YYYY-MM-DD")
 
     # Compose the URL
     url = f"{start_date_str}..{end_date_str}?"
@@ -121,22 +128,23 @@ def exchange_rates(
     data = get_path_with_retry(url)
 
     # Extract base currency and rates from the API response
-    base_currency = data["base"]
     rates = data["rates"]
 
     # Iterate over the rates dictionary (one entry per date)
     for date, daily_rates in rates.items():
+        formatted_date = pendulum.parse(date)
+
         # Add the base currency with a rate of 1.0
        yield {
-            "date": date,
-            "currency_name": base_currency,
+            "date": formatted_date,
+            "currency_code": "EUR",
             "rate": 1.0,
         }
 
         # Add all other currencies and their rates
-        for currency_name, rate in daily_rates.items():
+        for currency_code, rate in daily_rates.items():
             yield {
-                "date": date,
-                "currency_name": currency_name,
+                "date": formatted_date,
+                "currency_code": currency_code,
                 "rate": rate,
             }
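With the `table` parameter gone, `frankfurter_source` always builds all three resources and lets `dlt.sources.incremental` bound the `exchange_rates` range. A sketch of selecting a single resource (pipeline name and destination are illustrative; ingestr's `FrankfurterSource` does the equivalent internally):

```python
import dlt
import pendulum

from ingestr.src.frankfurter import frankfurter_source

source = frankfurter_source(
    start_date=pendulum.datetime(2024, 1, 1),
    end_date=pendulum.datetime(2024, 1, 31),
)

# "currencies" and "latest" can be selected the same way.
pipeline = dlt.pipeline(pipeline_name="frankfurter_demo", destination="duckdb")
pipeline.run(source.with_resources("exchange_rates"))
```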
ingestr/src/frankfurter/helpers.py CHANGED
@@ -8,7 +8,7 @@ FRANKFURTER_API_URL = "https://api.frankfurter.dev/v1/"
 
 
 def get_url_with_retry(url: str) -> StrAny:
-    r = requests.get(url)
+    r = requests.get(url, timeout=5)
     return r.json()  # type: ignore
 
 
@@ -19,7 +19,7 @@ def get_path_with_retry(path: str) -> StrAny:
 def validate_dates(start_date: datetime, end_date: datetime) -> None:
     current_date = pendulum.now()
 
-    # Check if start_date is in the future
+    # Check if start_date is in the futurep
     if start_date > current_date:
         raise ValueError("Interval-start cannot be in the future.")
 
ingestr/src/freshdesk/__init__.py ADDED
@@ -0,0 +1,72 @@
+"""This source uses Freshdesk API and dlt to load data such as Agents, Companies, Tickets
+etc. to the database"""
+
+from typing import Any, Dict, Generator, Iterable, List, Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .freshdesk_client import FreshdeskClient
+from .settings import DEFAULT_ENDPOINTS
+
+
+@dlt.source()
+def freshdesk_source(
+    endpoints: Optional[List[str]] = None,
+    per_page: int = 100,
+    domain: str = dlt.secrets.value,
+    api_secret_key: str = dlt.secrets.value,
+) -> Iterable[DltResource]:
+    """
+    Retrieves data from specified Freshdesk API endpoints.
+
+    This source supports pagination and incremental data loading. It fetches data from a list of
+    specified endpoints, or defaults to predefined endpoints in 'settings.py'.
+
+    Args:
+        endpoints: A list of Freshdesk API endpoints to fetch. Deafults to 'settings.py'.
+        per_page: The number of items to fetch per page, with a maximum of 100.
+        domain: The Freshdesk domain from which to fetch the data. Defaults to 'config.toml'.
+        api_secret_key: Freshdesk API key. Defaults to 'secrets.toml'.
+
+    Yields:
+        Iterable[DltResource]: Resources with data updated after the last 'updated_at'
+        timestamp for each endpoint.
+    """
+    # Instantiate FreshdeskClient with the provided domain and API key
+    freshdesk = FreshdeskClient(api_key=api_secret_key, domain=domain)
+
+    def incremental_resource(
+        endpoint: str,
+        updated_at: Optional[Any] = dlt.sources.incremental(
+            "updated_at", initial_value="2022-01-01T00:00:00Z"
+        ),
+    ) -> Generator[Dict[Any, Any], Any, None]:
+        """
+        Fetches and yields paginated data from a specified API endpoint.
+        Each page of data is fetched based on the `updated_at` timestamp
+        to ensure incremental loading.
+        """
+
+        # Retrieve the last updated timestamp to fetch only new or updated records.
+        if updated_at is not None:
+            updated_at = updated_at.last_value
+
+        # Use the FreshdeskClient instance to fetch paginated responses
+        yield from freshdesk.paginated_response(
+            endpoint=endpoint,
+            per_page=per_page,
+            updated_at=updated_at,
+        )
+
+    # Set default endpoints if not provided
+    endpoints = endpoints or DEFAULT_ENDPOINTS
+
+    # For each endpoint, create and yield a DLT resource
+    for endpoint in endpoints:
+        yield dlt.resource(
+            incremental_resource,
+            name=endpoint,
+            write_disposition="merge",
+            primary_key="id",
+        )(endpoint=endpoint)
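Each endpoint becomes a merge-disposition resource keyed on `id`, with `dlt.sources.incremental("updated_at")` remembering the last seen timestamp. A sketch of loading one endpoint directly with dlt; the domain, key, pipeline name and destination are placeholders (ingestr normally drives this through its `FreshdeskSource` wrapper):

```python
import dlt

from ingestr.src.freshdesk import freshdesk_source

# domain and api_secret_key would usually be resolved from dlt's secrets;
# they are passed explicitly here only for illustration.
source = freshdesk_source(domain="mycompany", api_secret_key="<freshdesk-api-key>")

pipeline = dlt.pipeline(pipeline_name="freshdesk_demo", destination="duckdb")
pipeline.run(source.with_resources("tickets"))
```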
ingestr/src/freshdesk/freshdesk_client.py ADDED
@@ -0,0 +1,102 @@
+"""Freshdesk Client for making authenticated requests"""
+
+import logging
+import time
+from typing import Any, Dict, Iterable, Optional
+
+from dlt.common.typing import TDataItem
+from dlt.sources.helpers import requests
+
+
+class FreshdeskClient:
+    """
+    Client for making authenticated requests to the Freshdesk API. It incorporates API requests with
+    rate limit and pagination.
+
+    Attributes:
+        api_key (str): The API key used for authenticating requests to the Freshdesk API.
+        domain (str): The Freshdesk domain specific to the user, used in constructing the base URL.
+        base_url (str): The base URL constructed from the domain, targeting the Freshdesk API v2.
+    """
+
+    def __init__(self, api_key: str, domain: str):
+        # Initialize the FreshdeskClient instance with API key and domain.
+        # The API key is used for authentication with the Freshdesk API.
+        # The domain specifies the unique Freshdesk domain of the user.
+
+        # Store the API key provided during initialization.
+        self.api_key = api_key
+        # Store the Freshdesk domain provided during initialization.
+        self.domain = domain
+
+        # Construct the base URL for the API requests.
+        # This URL is formed by appending the domain to the standard Freshdesk API base URL format.
+        # All API requests will use this base URL as their starting point.
+        self.base_url = f"https://{domain}.freshdesk.com/api/v2"
+
+    def _request_with_rate_limit(self, url: str, **kwargs: Any) -> requests.Response:
+        """
+        Handles rate limits in HTTP requests and ensures
+        that the client doesn't exceed the limit set by the server.
+        """
+
+        while True:
+            try:
+                response = requests.get(url, **kwargs, auth=(self.api_key, "X"))
+                response.raise_for_status()
+
+                return response
+            except requests.HTTPError as e:
+                if e.response.status_code == 429:
+                    # Get the 'Retry-After' header to know how long to wait
+                    # Fallback to 60 seconds if header is missing
+                    seconds_to_wait = int(e.response.headers.get("Retry-After", 60))
+                    # Log a warning message
+                    logging.warning(
+                        "Rate limited. Waiting to retry after: %s secs", seconds_to_wait
+                    )
+
+                    # Wait for the specified number of seconds before retrying
+                    time.sleep(seconds_to_wait)
+                else:
+                    # If the error is not a rate limit (429), raise the exception to be
+                    # handled elsewhere or stop execution
+                    raise
+
+    def paginated_response(
+        self,
+        endpoint: str,
+        per_page: int,
+        updated_at: Optional[str] = None,
+    ) -> Iterable[TDataItem]:
+        """
+        Fetches a paginated response from a specified endpoint.
+
+        This method will continuously fetch data from the given endpoint,
+        page by page, until no more data is available or until it reaches data
+        updated at the specified timestamp.
+        """
+        page = 1
+        while True:
+            # Construct the URL for the specific endpoint
+            url = f"{self.base_url}/{endpoint}"
+
+            params: Dict[str, Any] = {"per_page": per_page, "page": page}
+
+            # Implement date range splitting logic here, if applicable
+            if endpoint in ["tickets", "contacts"]:
+                param_key = (
+                    "updated_since" if endpoint == "tickets" else "_updated_since"
+                )
+                if updated_at:
+                    params[param_key] = updated_at
+
+            # Handle requests with rate-limiting
+            # A maximum of 300 pages (30000 tickets) will be returned.
+            response = self._request_with_rate_limit(url, params=params)
+            data = response.json()
+
+            if not data:
+                break  # Stop if no data or max page limit reached
+            yield data
+            page += 1
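The client retries only on HTTP 429 and sleeps for the server-supplied `Retry-After` value, falling back to 60 seconds. The same pattern, sketched standalone with the plain `requests` library rather than dlt's wrapper:

```python
import time

import requests


def get_with_retry_after(url: str, auth: tuple, **kwargs) -> requests.Response:
    """GET that retries on 429, honoring Retry-After (fallback: 60 seconds)."""
    while True:
        response = requests.get(url, auth=auth, **kwargs)
        if response.status_code != 429:
            response.raise_for_status()  # surface non-429 errors to the caller
            return response
        time.sleep(int(response.headers.get("Retry-After", 60)))
```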
ingestr/src/freshdesk/settings.py ADDED
@@ -0,0 +1,9 @@
+"""
+This module defines default settings for the Freshdesk integration.
+
+It specifies a list of default endpoints to be used when interacting with the Freshdesk API,
+covering common entities such as agents, companies, contacts, groups, roles, and tickets.
+"""
+
+# Define default endpoints for the Freshdesk API integration.
+DEFAULT_ENDPOINTS = ["agents", "companies", "contacts", "groups", "roles", "tickets"]
ingestr/src/google_analytics/__init__.py CHANGED
@@ -13,9 +13,10 @@ from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import (
     Dimension,
     Metric,
+    MinuteRange,
 )
 
-from .helpers import get_report
+from .helpers import get_realtime_report, get_report
 
 
 @dlt.source(max_table_nesting=0)
@@ -29,6 +30,7 @@ def google_analytics(
     start_date: Optional[pendulum.DateTime] = pendulum.datetime(2024, 1, 1),
     end_date: Optional[pendulum.DateTime] = None,
     rows_per_page: int = 10000,
+    minute_range_objects: List[MinuteRange] | None = None,
 ) -> List[DltResource]:
     try:
         property_id = int(property_id)
@@ -58,7 +60,7 @@ def google_analytics(
     dimensions = query["dimensions"]
 
     @dlt.resource(
-        name="basic_report",
+        name="custom",
         merge_key=datetime_dimension,
         write_disposition="merge",
     )
@@ -87,6 +89,22 @@ def google_analytics(
             end_date=end_date,
         )
 
+    # real time report
+    @dlt.resource(
+        name="realtime",
+        merge_key="ingested_at",
+        write_disposition="merge",
+    )
+    def real_time_report() -> Iterator[TDataItem]:
+        yield from get_realtime_report(
+            client=client,
+            property_id=property_id,
+            dimension_list=[Dimension(name=dimension) for dimension in dimensions],
+            metric_list=[Metric(name=metric) for metric in query["metrics"]],
+            per_page=rows_per_page,
+            minute_range_objects=minute_range_objects,
+        )
+
     # res = dlt.resource(
     #     basic_report, name="basic_report", merge_key=datetime_dimension, write_disposition="merge"
     # )(
@@ -103,4 +121,4 @@ def google_analytics(
     #     ),
     # )
 
-    return [basic_report]
+    return [basic_report, real_time_report]
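The new `realtime` resource issues realtime report requests and merges on an `ingested_at` timestamp added by the helpers; `minute_range_objects` narrows the report to specific minute windows. A small sketch of one such window built by hand (values are illustrative):

```python
from google.analytics.data_v1beta.types import MinuteRange

# Rows from the last five minutes; equivalent to what the helper builds from
# the string "0-5" (first number = end_minutes_ago, second = start_minutes_ago).
last_five_minutes = MinuteRange(
    name="0-5 minutes ago",
    start_minutes_ago=5,
    end_minutes_ago=0,
)
```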
ingestr/src/google_analytics/helpers.py CHANGED
@@ -2,8 +2,10 @@
 This module contains helpers that process data and make it ready for loading into the database
 """
 
+import base64
 import json
 from typing import Any, Iterator, List, Union
+from urllib.parse import parse_qs, urlparse
 
 import proto
 from dlt.common.exceptions import MissingDependencyException
@@ -22,6 +24,8 @@ try:
         Metric,
         MetricMetadata,  # noqa: F401
         MetricType,
+        MinuteRange,
+        RunRealtimeReportRequest,
         RunReportRequest,
         RunReportResponse,
     )
@@ -52,6 +56,53 @@ def to_dict(item: Any) -> Iterator[TDataItem]:
         yield item
 
 
+def get_realtime_report(
+    client: Resource,
+    property_id: int,
+    dimension_list: List[Dimension],
+    metric_list: List[Metric],
+    per_page: int,
+    minute_range_objects: List[MinuteRange] | None = None,
+) -> Iterator[TDataItem]:
+    """
+    Gets all the possible pages of reports with the given query parameters.
+    Processes every page and yields a dictionary for every row of the report.
+
+    Args:
+        client: The Google Analytics client used to make requests.
+        property_id: A reference to the Google Analytics project.
+            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+        dimension_list: A list of all the dimensions requested in the query.
+        metric_list: A list of all the metrics requested in the query.
+        limit: Describes how many rows there should be per page.
+
+    Yields:
+        Generator of all rows of data in the report.
+    """
+    offset = 0
+    ingest_at = pendulum.now().to_date_string()
+
+    while True:
+        request = RunRealtimeReportRequest(
+            property=f"properties/{property_id}",
+            dimensions=dimension_list,
+            metrics=metric_list,
+            limit=per_page,
+            minute_ranges=minute_range_objects if minute_range_objects else None,
+        )
+        response = client.run_realtime_report(request)
+
+        # process request
+        processed_response_generator = process_report(
+            response=response, ingest_at=ingest_at
+        )
+        # import pdb; pdb.set_trace()
+        yield from processed_response_generator
+        offset += per_page
+        if len(response.rows) < per_page or offset > 1000000:
+            break
+
+
 def get_report(
     client: Resource,
     property_id: int,
@@ -79,10 +130,6 @@ def get_report(
         Generator of all rows of data in the report.
     """
 
-    print(
-        "fetching for daterange", start_date.to_date_string(), end_date.to_date_string()
-    )
-
     offset = 0
     while True:
         request = RunReportRequest(
@@ -98,9 +145,11 @@ def get_report(
                 )
             ],
         )
-        # process request
         response = client.run_report(request)
+
+        # process request
         processed_response_generator = process_report(response=response)
+
         # import pdb; pdb.set_trace()
         yield from processed_response_generator
         offset += per_page
@@ -108,7 +157,9 @@ def get_report(
             break
 
 
-def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
+def process_report(
+    response: RunReportResponse, ingest_at: str | None = None
+) -> Iterator[TDataItems]:
     metrics_headers = [header.name for header in response.metric_headers]
     dimensions_headers = [header.name for header in response.dimension_headers]
 
@@ -131,6 +182,8 @@ def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
                 metric_type=metric_type, value=row.metric_values[i].value
             )
             response_dict[metrics_headers[i]] = metric_value
+            if ingest_at is not None:
+                response_dict["ingested_at"] = ingest_at
 
         unique_key = "-".join(list(response_dict.keys()))
         if unique_key not in distinct_key_combinations:
@@ -170,3 +223,65 @@ def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
         return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
     else:
         return dimension_value
+
+
+def convert_minutes_ranges_to_minute_range_objects(minutes_ranges: str) -> List[MinuteRange]:
+    minutes_ranges = minutes_ranges.strip()
+    minutes = minutes_ranges.replace(" ", "").split(",")
+    if minutes == "":
+        raise ValueError(
+            "Invalid input. Minutes range should be startminute-endminute format. For example: 1-2,5-6"
+        )
+
+
+    minute_range_objects = []
+    for min_range in minutes:
+        if "-" not in min_range:
+            raise ValueError(
+                "Invalid input. Minutes range should be startminute-endminute format. For example: 1-2,5-6"
+            )
+        parts = min_range.split("-")
+
+        if not parts[0].isdigit() or not parts[1].isdigit():
+            raise ValueError(
+                f"Invalid input '{min_range}'. Both start and end minutes must be digits. For example: 1-2,5-6"
+            )
+
+        end_minutes_ago = int(parts[0])
+        start_minutes_ago = int(parts[1])
+        minute_range_objects.append(MinuteRange(
+            name=f"{end_minutes_ago}-{start_minutes_ago} minutes ago",
+            start_minutes_ago=start_minutes_ago,
+            end_minutes_ago=end_minutes_ago
+        ))
+
+    return minute_range_objects
+
+
+def parse_google_analytics_uri(uri: str):
+    parse_uri = urlparse(uri)
+    source_fields = parse_qs(parse_uri.query)
+    cred_path = source_fields.get("credentials_path")
+    cred_base64 = source_fields.get("credentials_base64")
+
+    if not cred_path and not cred_base64:
+        raise ValueError(
+            "credentials_path or credentials_base64 is required to connect Google Analytics"
+        )
+    credentials = {}
+    if cred_path:
+        with open(cred_path[0], "r") as f:
+            credentials = json.load(f)
+    elif cred_base64:
+        credentials = json.loads(base64.b64decode(cred_base64[0]).decode("utf-8"))
+
+    property_id = source_fields.get("property_id")
+    if not property_id:
+        raise ValueError("property_id is required to connect to Google Analytics")
+
+    if (not cred_path and not cred_base64) or (not property_id):
+        raise ValueError(
+            "credentials_path or credentials_base64 and property_id are required to connect Google Analytics"
+        )
+
+    return {"credentials": credentials, "property_id": property_id[0]}
ingestr/src/sources.py CHANGED
@@ -852,22 +852,31 @@ class AirtableSource:
         if not table:
             raise ValueError("Source table is required to connect to Airtable")
 
-        tables = table.split(",")
-
         source_parts = urlparse(uri)
         source_fields = parse_qs(source_parts.query)
-        base_id = source_fields.get("base_id")
         access_token = source_fields.get("access_token")
 
-        if not base_id or not access_token:
+        if not access_token:
             raise ValueError(
-                "base_id and access_token in the URI are required to connect to Airtable"
+                "access_token in the URI is required to connect to Airtable"
             )
 
+        base_id = source_fields.get("base_id", [None])[0]
+        clean_table = table
+
+        table_fields = table.split("/")
+        if len(table_fields) == 2:
+            clean_table = table_fields[1]
+            if not base_id:
+                base_id = table_fields[0]
+
+        if not base_id:
+            raise ValueError("base_id in the URI is required to connect to Airtable")
+
         from ingestr.src.airtable import airtable_source
 
         return airtable_source(
-            base_id=base_id[0], table_names=tables, access_token=access_token[0]
+            base_id=base_id, table_names=[clean_table], access_token=access_token[0]
         )
 
 
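The table argument can now carry the base ID as `<base_id>/<table_name>`, so the URI only needs the access token. A sketch of the two accepted shapes (IDs and token are made up; a real token is needed for the source to actually fetch anything):

```python
from ingestr.src.sources import AirtableSource

src = AirtableSource()

# base_id in the URI, plain table name:
src.dlt_source(
    uri="airtable://?base_id=appXXXXXXXXXXXXXX&access_token=patYYYY",
    table="Orders",
)

# base_id folded into the table argument:
src.dlt_source(
    uri="airtable://?access_token=patYYYY",
    table="appXXXXXXXXXXXXXX/Orders",
)
```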
@@ -1460,48 +1469,49 @@ class GoogleAnalyticsSource:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
-        parse_uri = urlparse(uri)
-        source_fields = parse_qs(parse_uri.query)
-        cred_path = source_fields.get("credentials_path")
-        cred_base64 = source_fields.get("credentials_base64")
+        import ingestr.src.google_analytics.helpers as helpers
+
+        result = helpers.parse_google_analytics_uri(uri)
+        credentials = result["credentials"]
+        property_id = result["property_id"]
 
-        if not cred_path and not cred_base64:
+        fields = table.split(":")
+        if len(fields) != 3 and len(fields) != 4:
             raise ValueError(
-                "credentials_path or credentials_base64 is required to connect Google Analytics"
+                "Invalid table format. Expected format: <report_type>:<dimensions>:<metrics> or <report_type>:<dimensions>:<metrics>:<minute_ranges>"
             )
 
-        credentials = {}
-        if cred_path:
-            with open(cred_path[0], "r") as f:
-                credentials = json.load(f)
-        elif cred_base64:
-            credentials = json.loads(base64.b64decode(cred_base64[0]).decode("utf-8"))
-
-        property_id = source_fields.get("property_id")
-        if not property_id:
-            raise ValueError("property_id is required to connect to Google Analytics")
-
-        fields = table.split(":")
-        if len(fields) != 3:
+        report_type = fields[0]
+        if report_type not in ["custom", "realtime"]:
             raise ValueError(
-                "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
+                "Invalid report type. Expected format: <report_type>:<dimensions>:<metrics>. Available report types: custom, realtime"
             )
 
         dimensions = fields[1].replace(" ", "").split(",")
+        metrics = fields[2].replace(" ", "").split(",")
+
+        minute_range_objects = []
+        if len(fields) == 4:
+            minute_range_objects = helpers.convert_minutes_ranges_to_minute_range_objects(fields[3])
 
         datetime = ""
-        for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
-            if dimension_datetime in dimensions:
-                datetime = dimension_datetime
-                break
-        else:
-            raise ValueError(
-                "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
-            )
+        resource_name = fields[0].lower()
+        if resource_name == "custom":
+            for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
+                if dimension_datetime in dimensions:
+                    datetime = dimension_datetime
+                    break
+            else:
+                raise ValueError(
+                    "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
+                )
 
-        metrics = fields[2].replace(" ", "").split(",")
         queries = [
-            {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
+            {
+                "resource_name": resource_name,
+                "dimensions": dimensions,
+                "metrics": metrics,
+            }
         ]
 
         start_date = pendulum.now().subtract(days=30).start_of("day")
@@ -1515,13 +1525,14 @@ class GoogleAnalyticsSource:
         from ingestr.src.google_analytics import google_analytics
 
         return google_analytics(
-            property_id=property_id[0],
+            property_id=property_id,
             start_date=start_date,
             end_date=end_date,
             datetime_dimension=datetime,
             queries=queries,
             credentials=credentials,
-        ).with_resources("basic_report")
+            minute_range_objects=minute_range_objects if minute_range_objects else None,
+        ).with_resources(resource_name)
 
 
 class GitHubSource:
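The `--source-table` string now selects the report type as its first segment and accepts an optional fourth segment with minute ranges for the realtime report. Hedged examples of the accepted shapes (dimension and metric names are illustrative):

```python
# custom report — same behaviour as the old "basic_report", now addressed as "custom":
table = "custom:date,country:activeUsers,sessions"

# realtime report — optional fourth segment with minute windows, parsed by
# convert_minutes_ranges_to_minute_range_objects ("0-5,5-10" -> two MinuteRange objects):
table = "realtime:country:activeUsers:0-5,5-10"
```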
@@ -2164,36 +2175,60 @@ class FrankfurterSource:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
-        # start and end dates only assigned and validated for exchange_rates table
-        # Note: if an end date but no start date is provided, start date and end date will be set to current date
-        from ingestr.src.frankfurter import frankfurter_source
-        from ingestr.src.frankfurter.helpers import validate_dates
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Frankfurter takes care of incrementality on its own, you should not provide incremental_key"
+            )
 
-        if table == "exchange_rates":
-            if kwargs.get("interval_start"):
-                start_date = ensure_pendulum_datetime(str(kwargs.get("interval_start")))
-                if kwargs.get("interval_end"):
-                    end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
-                else:
-                    end_date = start_date
+        if kwargs.get("interval_start"):
+            start_date = ensure_pendulum_datetime(str(kwargs.get("interval_start")))
+            if kwargs.get("interval_end"):
+                end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
             else:
-                start_date = pendulum.now()
                 end_date = pendulum.now()
-            validate_dates(start_date=start_date, end_date=end_date)
-
-        # For currencies and latest tables, set start and end dates to current date
         else:
             start_date = pendulum.now()
             end_date = pendulum.now()
 
-        # Validate table
-        if table not in ["currencies", "latest", "exchange_rates"]:
-            raise ValueError(
-                f"Table '{table}' is not supported for Frankfurter source."
-            )
+        from ingestr.src.frankfurter import frankfurter_source
+        from ingestr.src.frankfurter.helpers import validate_dates
+
+        validate_dates(start_date=start_date, end_date=end_date)
 
-        return frankfurter_source(
-            table=table,
+        src = frankfurter_source(
             start_date=start_date,
             end_date=end_date,
         )
+
+        if table not in src.resources:
+            raise UnsupportedResourceError(table, "Frankfurter")
+
+        return src.with_resources(table)
+
+
+class FreshdeskSource:
+    # freshdesk://domain?api_key=<api_key>
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        domain = parsed_uri.netloc
+        query = parsed_uri.query
+        params = parse_qs(query)
+
+        if not domain:
+            raise MissingValueError("domain", "Freshdesk")
+
+        if '.' in domain:
+            domain = domain.split('.')[0]
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Freshdesk")
+
+        if table not in ["agents", "companies", "contacts", "groups", "roles", "tickets"]:
+            raise UnsupportedResourceError(table, "Freshdesk")
+
+        from ingestr.src.freshdesk import freshdesk_source
+        return freshdesk_source(api_secret_key=api_key[0], domain=domain).with_resources(table)
+
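Together with the factory registration above, this makes `freshdesk` a recognized scheme. A sketch of wiring it up directly through the new source class (domain and key are placeholders):

```python
from ingestr.src.sources import FreshdeskSource

source = FreshdeskSource().dlt_source(
    uri="freshdesk://mycompany?api_key=<freshdesk-api-key>",
    table="tickets",
)
# Only agents, companies, contacts, groups, roles and tickets are accepted;
# anything else raises UnsupportedResourceError.
```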
ingestr-0.13.36.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.34
+Version: 0.13.36
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -46,7 +46,7 @@ Requires-Dist: databricks-sqlalchemy==1.0.2
 Requires-Dist: dataclasses-json==0.6.7
 Requires-Dist: decorator==5.2.1
 Requires-Dist: deprecation==2.1.0
-Requires-Dist: dlt==1.9.0
+Requires-Dist: dlt==1.10.0
 Requires-Dist: dnspython==2.7.0
 Requires-Dist: duckdb-engine==0.17.0
 Requires-Dist: duckdb==1.2.1
ingestr-0.13.36.dist-info/RECORD CHANGED
@@ -2,21 +2,21 @@ ingestr/conftest.py,sha256=Q03FIJIZpLBbpj55cfCHIKEjc1FCvWJhMF2cidUJKQU,1748
 ingestr/main.py,sha256=mRlGSqi2sHcZ2AKlwn5MqoMvFxXlSjcZxmPJr76rmRk,25187
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=onMe5ZHxPXTdcB_s2oGNdMo-XQJ3ajwOsWE9eSTGFmc,1495
-ingestr/src/buildinfo.py,sha256=i3Tz80qXUH6VzMC8jzlySZd05zyoaaBcvoyLd2q-wKg,21
-ingestr/src/destinations.py,sha256=0fEwLY78SQDXbHcX4iz4Xc7H8FXN-QhVJL9uoUTZOs4,12924
+ingestr/src/buildinfo.py,sha256=abX1HXd_dkzG2hkJg7JdFGSvgjGi72VrEucHcTxIziA,21
+ingestr/src/destinations.py,sha256=Z79f01BSmEaXnQno2IQVt4Th4dmD-BiOQXlibZJ5sTw,13180
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256=659h_sVRBhtPv2dvtOK8tf3PtUhlK3KsWLrb20_iQKw,5333
+ingestr/src/factory.py,sha256=M0FAes6KsvqCzuTnUBcxc6DF7UVO51IlrFxy2VDpbkQ,5392
 ingestr/src/filters.py,sha256=C-_TIVkF_cxZBgG-Run2Oyn0TAhJgA8IWXZ-OPY3uek,1136
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
 ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
 ingestr/src/resource.py,sha256=XG-sbBapFVEM7OhHQFQRTdTLlh-mHB-N4V1t8F8Tsww,543
-ingestr/src/sources.py,sha256=uRERygJ41y0MNXF3-FJvHr4btxlEM93ZeWr_Liz3N2M,76181
+ingestr/src/sources.py,sha256=YpaUS5Ui-YXeZYLETPAj60WhU5fWI_lP6jVA0w6J6qo,77250
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
 ingestr/src/adjust/__init__.py,sha256=ULjtJqrNS6XDvUyGl0tjl12-tLyXlCgeFe2icTbtu3Q,3255
 ingestr/src/adjust/adjust_helpers.py,sha256=IHSS94A7enOWkZ8cP5iW3RdYt0Xl3qZGAmDc1Xy4qkI,3802
-ingestr/src/airtable/__init__.py,sha256=mdzeaq0g12HR8gbhtVR_aS_5GVWPZn6XD-zHUE5FunI,2788
+ingestr/src/airtable/__init__.py,sha256=XzRsS39xszUlh_s7P1_zq5v8vLfjz3m-NtTPaa8TTZU,2818
 ingestr/src/applovin/__init__.py,sha256=X_YCLppPrnL8KXfYWICE_uDfMzHHH3JZ-DBGZ1RlaOI,6984
 ingestr/src/applovin_max/__init__.py,sha256=ZrxOUSirGxkGDmM9wsQO3anwNVzqtoCwN_OuCXfPkXE,3285
 ingestr/src/appsflyer/__init__.py,sha256=QoK-B3cYYMD3bqzQaLWNH6FkJyjRbzRkBF2n6urxubs,8071
@@ -42,8 +42,11 @@ ingestr/src/facebook_ads/settings.py,sha256=1IxZeP_4rN3IBvAncNHOoqpzAirx0Hz-MUK_
 ingestr/src/filesystem/__init__.py,sha256=zkIwbRr0ir0EUdniI25p2zGiVc-7M9EmR351AjNb0eA,4163
 ingestr/src/filesystem/helpers.py,sha256=bg0muSHZr3hMa8H4jN2-LGWzI-SUoKlQNiWJ74-YYms,3211
 ingestr/src/filesystem/readers.py,sha256=a0fKkaRpnAOGsXI3EBNYZa7x6tlmAOsgRzb883StY30,3987
-ingestr/src/frankfurter/__init__.py,sha256=xJUicENGYtOPsGznKP8IA_5Jt-_gJP29onrByBgUf-g,4259
-ingestr/src/frankfurter/helpers.py,sha256=RSqI-WAAJfunWnLqiBRmPuonRg7rDOqmY76beb8a6rM,967
+ingestr/src/frankfurter/__init__.py,sha256=sjxfq377-lryuFC3JswcbHBRoBjLnGLKNRTwBpDZyLw,4403
+ingestr/src/frankfurter/helpers.py,sha256=wqm087QVPcyTuMl6yj_Pl1wcuqElwcBMPz3P4773wcM,979
+ingestr/src/freshdesk/__init__.py,sha256=uFQW_cJyymxtHQiYb_xjzZAklc487L0n9GkgHgC7yAI,2618
+ingestr/src/freshdesk/freshdesk_client.py,sha256=3z5Yc008ADzRcJWtNc00PwjkLzG-RMI8jVIOOyYA-Rw,4088
+ingestr/src/freshdesk/settings.py,sha256=0Wr_OMnUZcTlry7BmALssLxD2yh686JW4moLNv12Jnw,409
 ingestr/src/github/__init__.py,sha256=xVijF-Wi4p88hkVJnKH-oTixismjD3aUcGqGa6Wr4e4,5889
 ingestr/src/github/helpers.py,sha256=rpv_3HzuOl4PQ-FUeA66pev-pgze9SaE8RUHIPYfZ_A,6759
 ingestr/src/github/queries.py,sha256=W34C02jUEdjFmOE7f7u9xvYyBNDMfVZAu0JIRZI2mkU,2302
@@ -53,8 +56,8 @@ ingestr/src/google_ads/field.py,sha256=uc8KEaYQrwgQoQPUdxIQWZxpFeZHbiV98FM0ZSael
 ingestr/src/google_ads/metrics.py,sha256=tAqpBpm-8l95oPT9cBxMWaEoDTNHVXnqUphYDHWKDiE,12099
 ingestr/src/google_ads/predicates.py,sha256=K4wTuqfmJ9ko1RKeHTBDfQO_mUADVyuRqtywBPP-72w,683
 ingestr/src/google_ads/reports.py,sha256=AVY1pPt5yaIFskQe1k5VW2Dhlux3bzewsHlDrdGEems,12686
-ingestr/src/google_analytics/__init__.py,sha256=8Evpmoy464YpNbCI_NmvFHIzWCu7J7SjJw-RrPZ6AL8,3674
-ingestr/src/google_analytics/helpers.py,sha256=vLmFyQ_IEJEK5LlxBJQeJw0VHaE5gRRZdBa54U72CaQ,5965
+ingestr/src/google_analytics/__init__.py,sha256=8b9CBWJFrBpHVRl993Z7J01sKKbYyXEtngdfEUwqlfE,4343
+ingestr/src/google_analytics/helpers.py,sha256=bUTPp5C-k5wqq-ccEAn-asRH2CLbBS2SOs1v9wiRU6U,10087
 ingestr/src/google_sheets/README.md,sha256=wFQhvmGpRA38Ba2N_WIax6duyD4c7c_pwvvprRfQDnw,5470
 ingestr/src/google_sheets/__init__.py,sha256=CL0HfY74uxX8-ge0ucI0VhWMYZVAfoX7WRPBitRi-CI,6647
 ingestr/src/google_sheets/helpers/__init__.py,sha256=5hXZrZK8cMO3UOuL-s4OKOpdACdihQD0hYYlSEu-iQ8,35
@@ -122,8 +125,8 @@ ingestr/testdata/delete_insert_part2.csv,sha256=B_KUzpzbNdDY_n7wWop1mT2cz36TmayS
 ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ7ZqYN0,276
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
-ingestr-0.13.34.dist-info/METADATA,sha256=84NPfN9LSTGrw79p3116CXH9BZGjnXgEvglsXpVhEY0,13574
-ingestr-0.13.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-ingestr-0.13.34.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
-ingestr-0.13.34.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
-ingestr-0.13.34.dist-info/RECORD,,
+ingestr-0.13.36.dist-info/METADATA,sha256=AFJ4qtGMrtaG5luUcRCXAsp7yP8FKlL4EjP8KorvXKI,13575
+ingestr-0.13.36.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.36.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.36.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.36.dist-info/RECORD,,