ingestr 0.13.78__py3-none-any.whl → 0.13.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


ingestr/main.py CHANGED
@@ -1,3 +1,4 @@
+ import warnings
  from datetime import datetime
  from enum import Enum
  from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated

  from ingestr.src.telemetry.event import track

+ try:
+     from duckdb_engine import DuckDBEngineWarning
+
+     warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+ except ImportError:
+     # duckdb-engine not installed
+     pass
+
  app = typer.Typer(
      name="ingestr",
      help="ingestr is the CLI tool to ingest data from one source to another",
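The guarded import above registers a warnings filter only when the optional duckdb-engine package is importable, so DuckDB dialect warnings stay out of the CLI output without making duckdb-engine a hard dependency. A minimal sketch of the same pattern in isolation (the demo warning at the end is made up, purely to show the filter taking effect):

    import warnings

    try:
        from duckdb_engine import DuckDBEngineWarning
    except ImportError:
        DuckDBEngineWarning = None  # optional dependency absent; nothing to silence

    if DuckDBEngineWarning is not None:
        warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
        # With the filter registered, warnings of this category are dropped silently.
        warnings.warn("demo duckdb-engine notice", DuckDBEngineWarning)
        print("no DuckDBEngineWarning surfaced")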
@@ -506,7 +515,6 @@ def ingest(

      if factory.source_scheme == "sqlite":
          source_table = "main." + source_table.split(".")[-1]
-

      if (
          incremental_key
@@ -600,10 +608,9 @@ def ingest(
      if factory.source_scheme == "influxdb":
          if primary_key:
              write_disposition = "merge"
-

      start_time = datetime.now()
-
+
      run_info: LoadInfo = pipeline.run(
          dlt_source,
          **destination.dlt_run_params(
ingestr/src/buildinfo.py CHANGED
@@ -1 +1 @@
- version = "v0.13.78"
+ version = "v0.13.80"
ingestr/src/destinations.py CHANGED
@@ -147,6 +147,24 @@ class DuckDBDestination(GenericSqlDestination):
          return dlt.destinations.duckdb(uri, **kwargs)


+ class MotherduckDestination(GenericSqlDestination):
+     def dlt_dest(self, uri: str, **kwargs):
+         from urllib.parse import parse_qs, urlparse
+
+         parsed = urlparse(uri)
+         query = parse_qs(parsed.query)
+         token = query.get("token", [None])[0]
+         from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+         creds = {
+             "password": token,
+         }
+         if parsed.path.lstrip("/"):
+             creds["database"] = parsed.path.lstrip("/")
+
+         return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
  def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
      # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
      tup = struct.unpack(
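To make the new destination concrete, here is a standalone sketch of how the URI parsing above decomposes a MotherDuck connection string. The URI shape (database in the path, token as a query parameter) and the token value are assumptions for illustration; only urllib is exercised, no dlt or MotherDuck connection is made:

    from urllib.parse import parse_qs, urlparse

    # Hypothetical URI of the form md:///<database>?token=<service-token>
    uri = "md:///analytics?token=example-token"

    parsed = urlparse(uri)
    query = parse_qs(parsed.query)

    creds = {"password": query.get("token", [None])[0]}
    if parsed.path.lstrip("/"):
        creds["database"] = parsed.path.lstrip("/")

    print(creds)  # {'password': 'example-token', 'database': 'analytics'}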
@@ -26,7 +26,6 @@ from .settings import (
      DEFAULT_LEAD_FIELDS,
      INSIGHT_FIELDS_TYPES,
      INSIGHTS_BREAKDOWNS_OPTIONS,
-     INSIGHTS_PRIMARY_KEY,
      INVALID_INSIGHTS_FIELDS,
      TInsightsBreakdownOptions,
      TInsightsLevels,
@@ -118,6 +117,9 @@ def facebook_insights_source(
      app_api_version: str = None,
      start_date: pendulum.DateTime | None = None,
      end_date: pendulum.DateTime | None = None,
+     insights_max_wait_to_finish_seconds: int = 60 * 60 * 4,
+     insights_max_wait_to_start_seconds: int = 60 * 30,
+     insights_max_async_sleep_seconds: int = 20,
  ) -> DltResource:
      """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.

@@ -207,7 +209,9 @@ def facebook_insights_source(
          }
          job = execute_job(
              account.get_insights(params=query, is_async=True),
-             insights_max_async_sleep_seconds=20,
+             insights_max_async_sleep_seconds=insights_max_async_sleep_seconds,
+             insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds,
+             insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds,
          )
          output = list(map(process_report_item, job.get_result()))
          yield output
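The three new parameters bound how long the source waits on Facebook's asynchronous insights job: how long it may sit queued, how long it may run overall, and how long to sleep between polls. The sketch below is a simplified, generic polling loop meant only to illustrate those semantics; it is not the execute_job implementation shipped in this package:

    import time

    def wait_for_job(poll, max_wait_to_start=60 * 30, max_wait_to_finish=60 * 60 * 4, sleep_seconds=20):
        """Illustrative loop: poll() returns one of 'queued', 'running', or 'done'."""
        started = time.monotonic()
        while True:
            status = poll()
            elapsed = time.monotonic() - started
            if status == "done":
                return True
            if status == "queued" and elapsed > max_wait_to_start:
                raise TimeoutError("job never started")
            if elapsed > max_wait_to_finish:
                raise TimeoutError("job did not finish in time")
            time.sleep(sleep_seconds)

    # A fake job that is queued twice, runs twice, then completes.
    states = iter(["queued", "queued", "running", "running", "done"])
    print(wait_for_job(lambda: next(states), sleep_seconds=0))  # True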
@@ -144,7 +144,7 @@ def execute_job(
              raise InsightsJobTimeout(
                  "facebook_insights",
                  pretty_error_message.format(
-                     job_id, insights_max_wait_to_finish_seconds // 60
+                     job_id, insights_max_wait_to_finish_seconds
                  ),
              )

ingestr/src/factory.py CHANGED
@@ -12,6 +12,7 @@ from ingestr.src.destinations import (
      DatabricksDestination,
      DuckDBDestination,
      GCSDestination,
+     MotherduckDestination,
      MsSQLDestination,
      MySqlDestination,
      PostgresDestination,
@@ -85,6 +86,8 @@ SQL_SOURCE_SCHEMES = [
      "mysql",
      "mysql+pymysql",
      "mysql+mysqlconnector",
+     "md",
+     "motherduck",
      "postgres",
      "postgresql",
      "postgresql+psycopg2",
@@ -195,6 +198,8 @@ class SourceDestinationFactory:
          "cratedb": CrateDBDestination,
          "databricks": DatabricksDestination,
          "duckdb": DuckDBDestination,
+         "motherduck": MotherduckDestination,
+         "md": MotherduckDestination,
          "mssql": MsSQLDestination,
          "postgres": PostgresDestination,
          "postgresql": PostgresDestination,
@@ -4,6 +4,8 @@ etc. to the database"""
  from typing import Any, Dict, Generator, Iterable, List, Optional

  import dlt
+ import pendulum
+ from dlt.common.time import ensure_pendulum_datetime
  from dlt.sources import DltResource

  from .freshdesk_client import FreshdeskClient
@@ -12,10 +14,12 @@ from .settings import DEFAULT_ENDPOINTS

  @dlt.source()
  def freshdesk_source(
-     endpoints: Optional[List[str]] = None,
+     domain: str,
+     api_secret_key: str,
+     start_date: pendulum.DateTime,
+     end_date: Optional[pendulum.DateTime] = None,
      per_page: int = 100,
-     domain: str = dlt.secrets.value,
-     api_secret_key: str = dlt.secrets.value,
+     endpoints: Optional[List[str]] = None,
  ) -> Iterable[DltResource]:
      """
      Retrieves data from specified Freshdesk API endpoints.
@@ -39,7 +43,11 @@ def freshdesk_source(
      def incremental_resource(
          endpoint: str,
          updated_at: Optional[Any] = dlt.sources.incremental(
-             "updated_at", initial_value="2022-01-01T00:00:00Z"
+             "updated_at",
+             initial_value=start_date.isoformat(),
+             end_value=end_date.isoformat() if end_date else None,
+             range_start="closed",
+             range_end="closed",
          ),
      ) -> Generator[Dict[Any, Any], Any, None]:
          """
@@ -48,15 +56,22 @@ def freshdesk_source(
          to ensure incremental loading.
          """

-         # Retrieve the last updated timestamp to fetch only new or updated records.
-         if updated_at is not None:
-             updated_at = updated_at.last_value
+         if updated_at.last_value is not None:
+             start_date = ensure_pendulum_datetime(updated_at.last_value)
+         else:
+             start_date = start_date
+
+         if updated_at.end_value is not None:
+             end_date = ensure_pendulum_datetime(updated_at.end_value)
+         else:
+             end_date = pendulum.now(tz="UTC")

          # Use the FreshdeskClient instance to fetch paginated responses
          yield from freshdesk.paginated_response(
              endpoint=endpoint,
              per_page=per_page,
-             updated_at=updated_at,
+             start_date=start_date,
+             end_date=end_date,
          )

      # Set default endpoints if not provided
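The rewritten resource derives an explicit [start_date, end_date] window from dlt's incremental state instead of forwarding a raw last_value string. A stand-alone sketch of that window resolution, using SimpleNamespace in place of dlt's incremental object and pendulum.parse in place of ensure_pendulum_datetime:

    from types import SimpleNamespace

    import pendulum

    def resolve_window(updated_at, configured_start):
        # Mirrors the branching above: prefer stored incremental state, otherwise
        # fall back to the configured start_date and to "now" for the open end.
        if updated_at.last_value is not None:
            start = pendulum.parse(updated_at.last_value)
        else:
            start = configured_start
        if updated_at.end_value is not None:
            end = pendulum.parse(updated_at.end_value)
        else:
            end = pendulum.now(tz="UTC")
        return start, end

    # First run: no incremental state yet, so the configured start_date is used.
    print(resolve_window(SimpleNamespace(last_value=None, end_value=None), pendulum.datetime(2024, 1, 1)))

    # Later run: the stored last_value becomes the new window start.
    print(resolve_window(SimpleNamespace(last_value="2024-06-01T00:00:00Z", end_value=None), pendulum.datetime(2024, 1, 1)))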
@@ -2,8 +2,9 @@

  import logging
  import time
- from typing import Any, Dict, Iterable, Optional
+ from typing import Any, Dict, Iterable

+ import pendulum
  from dlt.common.typing import TDataItem
  from dlt.sources.helpers import requests

@@ -67,7 +68,8 @@ class FreshdeskClient:
          self,
          endpoint: str,
          per_page: int,
-         updated_at: Optional[str] = None,
+         start_date: pendulum.DateTime,
+         end_date: pendulum.DateTime,
      ) -> Iterable[TDataItem]:
          """
          Fetches a paginated response from a specified endpoint.
@@ -88,8 +90,8 @@
              param_key = (
                  "updated_since" if endpoint == "tickets" else "_updated_since"
              )
-             if updated_at:
-                 params[param_key] = updated_at
+
+             params[param_key] = start_date.to_iso8601_string()

              # Handle requests with rate-limiting
              # A maximum of 300 pages (30000 tickets) will be returned.
@@ -98,5 +100,14 @@

              if not data:
                  break  # Stop if no data or max page limit reached
-             yield data
+
+             filtered_data = [
+                 item
+                 for item in data
+                 if "updated_at" in item
+                 and pendulum.parse(item["updated_at"]) <= end_date
+             ]
+             if not filtered_data:
+                 break
+             yield filtered_data
              page += 1
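The client now drops records updated after end_date and stops paginating as soon as a page contributes nothing inside the window. A small illustration of that filter on made-up ticket payloads:

    import pendulum

    end_date = pendulum.parse("2024-06-30T23:59:59Z")

    page = [
        {"id": 1, "updated_at": "2024-06-15T10:00:00Z"},
        {"id": 2, "updated_at": "2024-07-02T08:30:00Z"},  # after the window, dropped
        {"id": 3},                                        # no updated_at field, dropped
    ]

    filtered = [
        item
        for item in page
        if "updated_at" in item and pendulum.parse(item["updated_at"]) <= end_date
    ]
    print([item["id"] for item in filtered])  # [1]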
@@ -91,7 +91,9 @@ def github_repo_events(
      """

      # use naming function in table name to generate separate tables for each event
-     @dlt.resource(primary_key= "id", table_name=lambda i: i["type"], write_disposition="merge")
+     @dlt.resource(
+         primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
+     )
      def repo_events(
          last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
              "created_at",
@@ -105,7 +107,7 @@ def github_repo_events(
          repos_path = (
              f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
          )
-
+
          # Get the date range from the incremental state
          start_filter = pendulum.parse(
              last_created_at.last_value or last_created_at.initial_value
@@ -115,7 +117,7 @@ def github_repo_events(
              if last_created_at.end_value
              else pendulum.now()
          )
-
+
          for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
              # Filter events by date range
              filtered_events = []
@@ -61,6 +61,7 @@ def get_stargazers(
          page_items,
      )

+
  def get_reactions_data(
      node_type: str,
      owner: str,
@@ -7,6 +7,7 @@ from dlt.sources import DltResource

  from .client import InfluxClient

+
  @dlt.source(max_table_nesting=0)
  def influxdb_source(
      measurement: str,
@@ -3,7 +3,22 @@ from typing import Any, Dict, Iterable, Iterator
  import dlt
  import pendulum

- from .helpers import _normalize_issue, _normalize_team, _paginate
+ from .helpers import _paginate, normalize_dictionaries
+
+
+ def _get_date_range(updated_at, start_date):
+     """Extract current start and end dates from incremental state."""
+     if updated_at.last_value:
+         current_start_date = pendulum.parse(updated_at.last_value)
+     else:
+         current_start_date = pendulum.parse(start_date)
+
+     if updated_at.end_value:
+         current_end_date = pendulum.parse(updated_at.end_value)
+     else:
+         current_end_date = pendulum.now(tz="UTC")
+
+     return current_start_date, current_end_date

  ISSUES_QUERY = """
  query Issues($cursor: String) {
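The new _get_date_range helper centralises the start/end resolution that each Linear resource previously repeated inline. A quick, self-contained check of its behaviour; the helper is restated here so the snippet runs on its own, and SimpleNamespace stands in for dlt's incremental state object:

    from types import SimpleNamespace

    import pendulum

    # Restated from the hunk above so the example is self-contained.
    def _get_date_range(updated_at, start_date):
        if updated_at.last_value:
            current_start_date = pendulum.parse(updated_at.last_value)
        else:
            current_start_date = pendulum.parse(start_date)
        if updated_at.end_value:
            current_end_date = pendulum.parse(updated_at.end_value)
        else:
            current_end_date = pendulum.now(tz="UTC")
        return current_start_date, current_end_date

    # Backfill-style run: both bounds come from the incremental state.
    state = SimpleNamespace(last_value="2024-05-01T00:00:00Z", end_value="2024-05-31T23:59:59Z")
    print(_get_date_range(state, "2024-01-01T00:00:00Z"))

    # First run: no state yet, so start_date and "now" bound the window.
    print(_get_date_range(SimpleNamespace(last_value=None, end_value=None), "2024-01-01T00:00:00Z"))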
@@ -84,7 +99,25 @@ query Users($cursor: String) {
    }
  }
  """
-
+ WORKFLOW_STATES_QUERY = """
+ query WorkflowStates($cursor: String) {
+   workflowStates(first: 50, after: $cursor) {
+     nodes {
+       archivedAt
+       color
+       createdAt
+       id
+       inheritedFrom { id }
+       name
+       position
+       team { id }
+       type
+       updatedAt
+     }
+     pageInfo { hasNextPage endCursor }
+   }
+ }
+ """

  @dlt.source(name="linear", max_table_nesting=0)
  def linear_source(
@@ -102,20 +135,12 @@ def linear_source(
              range_end="closed",
          ),
      ) -> Iterator[Dict[str, Any]]:
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
-
-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)

          for item in _paginate(api_key, ISSUES_QUERY, "issues"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield _normalize_issue(item)
+                     yield normalize_dictionaries(item)

      @dlt.resource(name="projects", primary_key="id", write_disposition="merge")
      def projects(
@@ -127,20 +152,12 @@ def linear_source(
              range_end="closed",
          ),
      ) -> Iterator[Dict[str, Any]]:
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
-
-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)

          for item in _paginate(api_key, PROJECTS_QUERY, "projects"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield item
+                     yield normalize_dictionaries(item)

      @dlt.resource(name="teams", primary_key="id", write_disposition="merge")
      def teams(
@@ -153,21 +170,13 @@ def linear_source(
          ),
      ) -> Iterator[Dict[str, Any]]:
          print(start_date)
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)
          print(current_start_date)

-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
-
          for item in _paginate(api_key, TEAMS_QUERY, "teams"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield _normalize_team(item)
+                     yield normalize_dictionaries(item)

      @dlt.resource(name="users", primary_key="id", write_disposition="merge")
      def users(
@@ -179,19 +188,28 @@ def linear_source(
              range_end="closed",
          ),
      ) -> Iterator[Dict[str, Any]]:
-         if updated_at.last_value:
-             current_start_date = pendulum.parse(updated_at.last_value)
-         else:
-             current_start_date = pendulum.parse(start_date)
-
-         if updated_at.end_value:
-             current_end_date = pendulum.parse(updated_at.end_value)
-         else:
-             current_end_date = pendulum.now(tz="UTC")
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)

          for item in _paginate(api_key, USERS_QUERY, "users"):
              if pendulum.parse(item["updatedAt"]) >= current_start_date:
                  if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                     yield item
+                     yield normalize_dictionaries(item)
+
+     @dlt.resource(name="workflow_states", primary_key="id", write_disposition="merge")
+     def workflow_states(
+         updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+             "updatedAt",
+             initial_value=start_date.isoformat(),
+             end_value=end_date.isoformat() if end_date else None,
+             range_start="closed",
+             range_end="closed",
+         ),
+     ) -> Iterator[Dict[str, Any]]:
+         current_start_date, current_end_date = _get_date_range(updated_at, start_date)
+
+         for item in _paginate(api_key, WORKFLOW_STATES_QUERY, "workflowStates"):
+             if pendulum.parse(item["updatedAt"]) >= current_start_date:
+                 if pendulum.parse(item["updatedAt"]) <= current_end_date:
+                     yield normalize_dictionaries(item)
+     return [issues, projects, teams, users, workflow_states]

-     return issues, projects, teams, users
@@ -32,41 +32,24 @@ def _paginate(api_key: str, query: str, root: str) -> Iterator[Dict[str, Any]]:
          cursor = data["pageInfo"]["endCursor"]


- def _normalize_issue(item: Dict[str, Any]) -> Dict[str, Any]:
-     field_mapping = {
-         "assignee": "assignee_id",
-         "creator": "creator_id",
-         "state": "state_id",
-         "cycle": "cycle_id",
-         "project": "project_id",
-     }
-     for key, value in field_mapping.items():
-         if item.get(key):
-             item[value] = item[key]["id"]
-             del item[key]
-         else:
-             item[value] = None
-             del item[key]
-     json_fields = [
-         "comments",
-         "subscribers",
-         "attachments",
-         "labels",
-         "subtasks",
-         "projects",
-         "memberships",
-         "members",
-     ]
-     for field in json_fields:
-         if item.get(field):
-             item[f"{field}"] = item[field].get("nodes", [])

-     return item

-
- def _normalize_team(item: Dict[str, Any]) -> Dict[str, Any]:
-     json_fields = ["memberships", "members", "projects"]
-     for field in json_fields:
-         if item.get(field):
-             item[f"{field}"] = item[field].get("nodes", [])
-     return item
+ def normalize_dictionaries(item: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Automatically normalize dictionary fields by detecting their structure:
+     - Convert nested objects with 'id' field to {field_name}_id
+     - Convert objects with 'nodes' field to arrays
+     """
+     normalized_item = item.copy()
+
+     for key, value in list(normalized_item.items()):
+         if isinstance(value, dict):
+             # If the dict has an 'id' field, replace with {key}_id
+             if 'id' in value:
+                 normalized_item[f"{key}_id"] = value['id']
+                 del normalized_item[key]
+             # If the dict has 'nodes' field, extract the nodes array
+             elif 'nodes' in value:
+                 normalized_item[key] = value['nodes']
+
+     return normalized_item
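The generic normalize_dictionaries replaces the two hand-written normalizers: any nested object carrying an id collapses into a {field}_id value, and any {"nodes": [...]} wrapper becomes a plain list. Applied to a made-up Linear issue payload (the helper is restated so the example runs on its own):

    from typing import Any, Dict

    # Restated from the hunk above.
    def normalize_dictionaries(item: Dict[str, Any]) -> Dict[str, Any]:
        normalized_item = item.copy()
        for key, value in list(normalized_item.items()):
            if isinstance(value, dict):
                if "id" in value:
                    normalized_item[f"{key}_id"] = value["id"]
                    del normalized_item[key]
                elif "nodes" in value:
                    normalized_item[key] = value["nodes"]
        return normalized_item

    issue = {
        "id": "ISS-1",
        "title": "Example issue",
        "assignee": {"id": "user-42"},           # becomes assignee_id
        "labels": {"nodes": [{"name": "bug"}]},  # unwrapped to a plain list
    }
    print(normalize_dictionaries(issue))
    # {'id': 'ISS-1', 'title': 'Example issue', 'labels': [{'name': 'bug'}], 'assignee_id': 'user-42'}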
@@ -106,6 +106,7 @@ def mongodb_collection(
      filter_: Optional[Dict[str, Any]] = None,
      projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
      pymongoarrow_schema: Optional[Any] = None,
+     custom_query: Optional[List[Dict[str, Any]]] = None,
  ) -> Any:
      """
      A DLT source which loads a collection from a mongo database using PyMongo.
@@ -132,6 +133,7 @@ def mongodb_collection(
              exclude (dict) - {"released": False, "runtime": False}
              Note: Can't mix include and exclude statements '{"title": True, "released": False}`
          pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+         custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

      Returns:
          Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -161,4 +163,5 @@
          filter_=filter_ or {},
          projection=projection,
          pymongoarrow_schema=pymongoarrow_schema,
+         custom_query=custom_query,
      )
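custom_query lets a caller replace the default find() with a MongoDB aggregation pipeline; the value is simply a list of stage dictionaries. A hypothetical example, with stage contents and field names made up for illustration:

    # Hypothetical pipeline that could be passed as custom_query.
    pipeline = [
        {"$match": {"status": "active", "amount": {"$gte": 100}}},
        {"$project": {"_id": 1, "status": 1, "amount": 1, "updated_at": 1}},
        {"$sort": {"updated_at": 1}},
    ]

    # It would then be forwarded via the new keyword argument, for example:
    # mongodb_collection(..., custom_query=pipeline)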