ingestr 0.13.78__py3-none-any.whl → 0.13.80__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +10 -3
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/destinations.py +18 -0
- ingestr/src/facebook_ads/__init__.py +6 -2
- ingestr/src/facebook_ads/helpers.py +1 -1
- ingestr/src/factory.py +5 -0
- ingestr/src/freshdesk/__init__.py +23 -8
- ingestr/src/freshdesk/freshdesk_client.py +16 -5
- ingestr/src/github/__init__.py +5 -3
- ingestr/src/github/helpers.py +1 -0
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/linear/__init__.py +61 -43
- ingestr/src/linear/helpers.py +19 -36
- ingestr/src/mongodb/__init__.py +3 -0
- ingestr/src/mongodb/helpers.py +178 -11
- ingestr/src/sources.py +311 -25
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/METADATA +6 -1
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/RECORD +21 -21
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/WHEEL +0 -0
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.78.dist-info → ingestr-0.13.80.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 from enum import Enum
 from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated

 from ingestr.src.telemetry.event import track

+try:
+    from duckdb_engine import DuckDBEngineWarning
+
+    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+except ImportError:
+    # duckdb-engine not installed
+    pass
+
 app = typer.Typer(
     name="ingestr",
     help="ingestr is the CLI tool to ingest data from one source to another",
@@ -506,7 +515,6 @@ def ingest(

     if factory.source_scheme == "sqlite":
         source_table = "main." + source_table.split(".")[-1]
-

     if (
         incremental_key
@@ -600,10 +608,9 @@ def ingest(
     if factory.source_scheme == "influxdb":
         if primary_key:
             write_disposition = "merge"
-

     start_time = datetime.now()
-
+
     run_info: LoadInfo = pipeline.run(
         dlt_source,
         **destination.dlt_run_params(

ingestr/src/buildinfo.py
CHANGED
@@ -1 +1 @@
-version = "v0.13.78"
+version = "v0.13.80"

ingestr/src/destinations.py
CHANGED
@@ -147,6 +147,24 @@ class DuckDBDestination(GenericSqlDestination):
         return dlt.destinations.duckdb(uri, **kwargs)


+class MotherduckDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(uri)
+        query = parse_qs(parsed.query)
+        token = query.get("token", [None])[0]
+        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+        creds = {
+            "password": token,
+        }
+        if parsed.path.lstrip("/"):
+            creds["database"] = parsed.path.lstrip("/")
+
+        return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
 def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
     # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
     tup = struct.unpack(

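As a minimal sketch of what the new MotherduckDestination parsing above accepts, the snippet below decomposes a MotherDuck-style URI the same way with the standard library; the exact URI shape shown is an assumption inferred from the token/database handling in the diff, not something documented here.

    from urllib.parse import parse_qs, urlparse

    # Hypothetical URI; scheme and query layout are assumptions based on the diff above.
    uri = "md:///my_database?token=SECRET_TOKEN"
    parsed = urlparse(uri)
    token = parse_qs(parsed.query).get("token", [None])[0]  # "SECRET_TOKEN"
    database = parsed.path.lstrip("/") or None              # "my_database"
    print(token, database)
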
ingestr/src/facebook_ads/__init__.py
CHANGED
@@ -26,7 +26,6 @@ from .settings import (
     DEFAULT_LEAD_FIELDS,
     INSIGHT_FIELDS_TYPES,
     INSIGHTS_BREAKDOWNS_OPTIONS,
-    INSIGHTS_PRIMARY_KEY,
     INVALID_INSIGHTS_FIELDS,
     TInsightsBreakdownOptions,
     TInsightsLevels,
@@ -118,6 +117,9 @@ def facebook_insights_source(
     app_api_version: str = None,
     start_date: pendulum.DateTime | None = None,
     end_date: pendulum.DateTime | None = None,
+    insights_max_wait_to_finish_seconds: int = 60 * 60 * 4,
+    insights_max_wait_to_start_seconds: int = 60 * 30,
+    insights_max_async_sleep_seconds: int = 20,
 ) -> DltResource:
     """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.

@@ -207,7 +209,9 @@ def facebook_insights_source(
         }
         job = execute_job(
             account.get_insights(params=query, is_async=True),
-            insights_max_async_sleep_seconds=
+            insights_max_async_sleep_seconds=insights_max_async_sleep_seconds,
+            insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds,
+            insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds,
         )
         output = list(map(process_report_item, job.get_result()))
         yield output

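For reference, the new async-report timeout defaults introduced above work out to the following wall-clock values (plain arithmetic, not code from the package; the poll-interval reading of the last knob is inferred from the parameter name):

    insights_max_wait_to_finish_seconds = 60 * 60 * 4  # 14400 s = 4 hours to wait for a report to finish
    insights_max_wait_to_start_seconds = 60 * 30       # 1800 s = 30 minutes to wait for the async job to start
    insights_max_async_sleep_seconds = 20              # 20 s upper bound on sleep between job status checks
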
ingestr/src/factory.py
CHANGED
@@ -12,6 +12,7 @@ from ingestr.src.destinations import (
     DatabricksDestination,
     DuckDBDestination,
     GCSDestination,
+    MotherduckDestination,
     MsSQLDestination,
     MySqlDestination,
     PostgresDestination,
@@ -85,6 +86,8 @@ SQL_SOURCE_SCHEMES = [
     "mysql",
     "mysql+pymysql",
     "mysql+mysqlconnector",
+    "md",
+    "motherduck",
     "postgres",
     "postgresql",
     "postgresql+psycopg2",
@@ -195,6 +198,8 @@ class SourceDestinationFactory:
         "cratedb": CrateDBDestination,
         "databricks": DatabricksDestination,
         "duckdb": DuckDBDestination,
+        "motherduck": MotherduckDestination,
+        "md": MotherduckDestination,
         "mssql": MsSQLDestination,
         "postgres": PostgresDestination,
         "postgresql": PostgresDestination,

ingestr/src/freshdesk/__init__.py
CHANGED
@@ -4,6 +4,8 @@ etc. to the database"""
 from typing import Any, Dict, Generator, Iterable, List, Optional

 import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
 from dlt.sources import DltResource

 from .freshdesk_client import FreshdeskClient
@@ -12,10 +14,12 @@ from .settings import DEFAULT_ENDPOINTS

 @dlt.source()
 def freshdesk_source(
-
+    domain: str,
+    api_secret_key: str,
+    start_date: pendulum.DateTime,
+    end_date: Optional[pendulum.DateTime] = None,
     per_page: int = 100,
-
-    api_secret_key: str = dlt.secrets.value,
+    endpoints: Optional[List[str]] = None,
 ) -> Iterable[DltResource]:
     """
     Retrieves data from specified Freshdesk API endpoints.
@@ -39,7 +43,11 @@ def freshdesk_source(
     def incremental_resource(
         endpoint: str,
         updated_at: Optional[Any] = dlt.sources.incremental(
-            "updated_at",
+            "updated_at",
+            initial_value=start_date.isoformat(),
+            end_value=end_date.isoformat() if end_date else None,
+            range_start="closed",
+            range_end="closed",
         ),
     ) -> Generator[Dict[Any, Any], Any, None]:
         """
@@ -48,15 +56,22 @@ def freshdesk_source(
         to ensure incremental loading.
         """

-
-
-
+        if updated_at.last_value is not None:
+            start_date = ensure_pendulum_datetime(updated_at.last_value)
+        else:
+            start_date = start_date
+
+        if updated_at.end_value is not None:
+            end_date = ensure_pendulum_datetime(updated_at.end_value)
+        else:
+            end_date = pendulum.now(tz="UTC")

         # Use the FreshdeskClient instance to fetch paginated responses
         yield from freshdesk.paginated_response(
             endpoint=endpoint,
             per_page=per_page,
-
+            start_date=start_date,
+            end_date=end_date,
         )

     # Set default endpoints if not provided

ingestr/src/freshdesk/freshdesk_client.py
CHANGED
@@ -2,8 +2,9 @@

 import logging
 import time
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable

+import pendulum
 from dlt.common.typing import TDataItem
 from dlt.sources.helpers import requests

@@ -67,7 +68,8 @@ class FreshdeskClient:
         self,
         endpoint: str,
         per_page: int,
-
+        start_date: pendulum.DateTime,
+        end_date: pendulum.DateTime,
     ) -> Iterable[TDataItem]:
         """
         Fetches a paginated response from a specified endpoint.
@@ -88,8 +90,8 @@ class FreshdeskClient:
            param_key = (
                "updated_since" if endpoint == "tickets" else "_updated_since"
            )
-
-
+
+            params[param_key] = start_date.to_iso8601_string()

            # Handle requests with rate-limiting
            # A maximum of 300 pages (30000 tickets) will be returned.
@@ -98,5 +100,14 @@ class FreshdeskClient:

            if not data:
                break  # Stop if no data or max page limit reached
-
+
+            filtered_data = [
+                item
+                for item in data
+                if "updated_at" in item
+                and pendulum.parse(item["updated_at"]) <= end_date
+            ]
+            if not filtered_data:
+                break
+            yield filtered_data
            page += 1

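To illustrate the client-side end_date cutoff added above, here is a minimal stand-alone sketch using pendulum on made-up ticket records (field values are illustrative, not Freshdesk data):

    import pendulum

    end_date = pendulum.datetime(2024, 6, 1, tz="UTC")
    data = [
        {"id": 1, "updated_at": "2024-05-30T10:00:00Z"},
        {"id": 2, "updated_at": "2024-06-02T10:00:00Z"},
    ]
    # Same comparison as in paginated_response: drop anything updated after end_date.
    filtered = [t for t in data if pendulum.parse(t["updated_at"]) <= end_date]
    print([t["id"] for t in filtered])  # [1]
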
ingestr/src/github/__init__.py
CHANGED
@@ -91,7 +91,9 @@ def github_repo_events(
     """

     # use naming function in table name to generate separate tables for each event
-    @dlt.resource(
+    @dlt.resource(
+        primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
+    )
     def repo_events(
         last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
             "created_at",
@@ -105,7 +107,7 @@ def github_repo_events(
         repos_path = (
             f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
         )
-
+
         # Get the date range from the incremental state
         start_filter = pendulum.parse(
             last_created_at.last_value or last_created_at.initial_value
@@ -115,7 +117,7 @@ def github_repo_events(
             if last_created_at.end_value
             else pendulum.now()
         )
-
+
         for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
             # Filter events by date range
             filtered_events = []

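The table_name callable in the decorator above routes each event into a table named after its GitHub event type; a tiny illustration with made-up event payloads:

    table_name = lambda i: i["type"]
    print(table_name({"type": "PushEvent", "id": "1"}))    # PushEvent
    print(table_name({"type": "IssuesEvent", "id": "2"}))  # IssuesEvent
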
ingestr/src/github/helpers.py
CHANGED
ingestr/src/influxdb/__init__.py
CHANGED
ingestr/src/linear/__init__.py
CHANGED
@@ -3,7 +3,22 @@ from typing import Any, Dict, Iterable, Iterator
 import dlt
 import pendulum

-from .helpers import
+from .helpers import _paginate, normalize_dictionaries
+
+
+def _get_date_range(updated_at, start_date):
+    """Extract current start and end dates from incremental state."""
+    if updated_at.last_value:
+        current_start_date = pendulum.parse(updated_at.last_value)
+    else:
+        current_start_date = pendulum.parse(start_date)
+
+    if updated_at.end_value:
+        current_end_date = pendulum.parse(updated_at.end_value)
+    else:
+        current_end_date = pendulum.now(tz="UTC")
+
+    return current_start_date, current_end_date

 ISSUES_QUERY = """
 query Issues($cursor: String) {
@@ -84,7 +99,25 @@ query Users($cursor: String) {
   }
 }
 """
-
+WORKFLOW_STATES_QUERY = """
+query WorkflowStates($cursor: String) {
+  workflowStates(first: 50, after: $cursor) {
+    nodes {
+      archivedAt
+      color
+      createdAt
+      id
+      inheritedFrom { id }
+      name
+      position
+      team { id }
+      type
+      updatedAt
+    }
+    pageInfo { hasNextPage endCursor }
+  }
+}
+"""

 @dlt.source(name="linear", max_table_nesting=0)
 def linear_source(
@@ -102,20 +135,12 @@ def linear_source(
             range_end="closed",
         ),
     ) -> Iterator[Dict[str, Any]]:
-        if updated_at.last_value:
-            current_start_date = pendulum.parse(updated_at.last_value)
-        else:
-            current_start_date = pendulum.parse(start_date)
-
-        if updated_at.end_value:
-            current_end_date = pendulum.parse(updated_at.end_value)
-        else:
-            current_end_date = pendulum.now(tz="UTC")
+        current_start_date, current_end_date = _get_date_range(updated_at, start_date)

         for item in _paginate(api_key, ISSUES_QUERY, "issues"):
             if pendulum.parse(item["updatedAt"]) >= current_start_date:
                 if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                    yield
+                    yield normalize_dictionaries(item)

     @dlt.resource(name="projects", primary_key="id", write_disposition="merge")
     def projects(
@@ -127,20 +152,12 @@ def linear_source(
             range_end="closed",
         ),
     ) -> Iterator[Dict[str, Any]]:
-        if updated_at.last_value:
-            current_start_date = pendulum.parse(updated_at.last_value)
-        else:
-            current_start_date = pendulum.parse(start_date)
-
-        if updated_at.end_value:
-            current_end_date = pendulum.parse(updated_at.end_value)
-        else:
-            current_end_date = pendulum.now(tz="UTC")
+        current_start_date, current_end_date = _get_date_range(updated_at, start_date)

         for item in _paginate(api_key, PROJECTS_QUERY, "projects"):
             if pendulum.parse(item["updatedAt"]) >= current_start_date:
                 if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                    yield item
+                    yield normalize_dictionaries(item)

     @dlt.resource(name="teams", primary_key="id", write_disposition="merge")
     def teams(
@@ -153,21 +170,13 @@ def linear_source(
         ),
     ) -> Iterator[Dict[str, Any]]:
         print(start_date)
-        if updated_at.last_value:
-            current_start_date = pendulum.parse(updated_at.last_value)
-        else:
-            current_start_date = pendulum.parse(start_date)
+        current_start_date, current_end_date = _get_date_range(updated_at, start_date)
         print(current_start_date)

-        if updated_at.end_value:
-            current_end_date = pendulum.parse(updated_at.end_value)
-        else:
-            current_end_date = pendulum.now(tz="UTC")
-
         for item in _paginate(api_key, TEAMS_QUERY, "teams"):
             if pendulum.parse(item["updatedAt"]) >= current_start_date:
                 if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                    yield
+                    yield normalize_dictionaries(item)

     @dlt.resource(name="users", primary_key="id", write_disposition="merge")
     def users(
@@ -179,19 +188,28 @@ def linear_source(
             range_end="closed",
         ),
     ) -> Iterator[Dict[str, Any]]:
-        if updated_at.last_value:
-            current_start_date = pendulum.parse(updated_at.last_value)
-        else:
-            current_start_date = pendulum.parse(start_date)
-
-        if updated_at.end_value:
-            current_end_date = pendulum.parse(updated_at.end_value)
-        else:
-            current_end_date = pendulum.now(tz="UTC")
+        current_start_date, current_end_date = _get_date_range(updated_at, start_date)

         for item in _paginate(api_key, USERS_QUERY, "users"):
             if pendulum.parse(item["updatedAt"]) >= current_start_date:
                 if pendulum.parse(item["updatedAt"]) <= current_end_date:
-                    yield item
+                    yield normalize_dictionaries(item)
+
+    @dlt.resource(name="workflow_states", primary_key="id", write_disposition="merge")
+    def workflow_states(
+        updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "updatedAt",
+            initial_value=start_date.isoformat(),
+            end_value=end_date.isoformat() if end_date else None,
+            range_start="closed",
+            range_end="closed",
+        ),
+    ) -> Iterator[Dict[str, Any]]:
+        current_start_date, current_end_date = _get_date_range(updated_at, start_date)
+
+        for item in _paginate(api_key, WORKFLOW_STATES_QUERY, "workflowStates"):
+            if pendulum.parse(item["updatedAt"]) >= current_start_date:
+                if pendulum.parse(item["updatedAt"]) <= current_end_date:
+                    yield normalize_dictionaries(item)
+    return [issues, projects, teams, users, workflow_states]

-    return issues, projects, teams, users

ingestr/src/linear/helpers.py
CHANGED
@@ -32,41 +32,24 @@ def _paginate(api_key: str, query: str, root: str) -> Iterator[Dict[str, Any]]:
         cursor = data["pageInfo"]["endCursor"]


-def _normalize_issue(item: Dict[str, Any]) -> Dict[str, Any]:
-    field_mapping = {
-        "assignee": "assignee_id",
-        "creator": "creator_id",
-        "state": "state_id",
-        "cycle": "cycle_id",
-        "project": "project_id",
-    }
-    for key, value in field_mapping.items():
-        if item.get(key):
-            item[value] = item[key]["id"]
-            del item[key]
-        else:
-            item[value] = None
-            del item[key]
-    json_fields = [
-        "comments",
-        "subscribers",
-        "attachments",
-        "labels",
-        "subtasks",
-        "projects",
-        "memberships",
-        "members",
-    ]
-    for field in json_fields:
-        if item.get(field):
-            item[f"{field}"] = item[field].get("nodes", [])

-    return item

-
-
-
-
-
-
-
+def normalize_dictionaries(item: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Automatically normalize dictionary fields by detecting their structure:
+    - Convert nested objects with 'id' field to {field_name}_id
+    - Convert objects with 'nodes' field to arrays
+    """
+    normalized_item = item.copy()
+
+    for key, value in list(normalized_item.items()):
+        if isinstance(value, dict):
+            # If the dict has an 'id' field, replace with {key}_id
+            if 'id' in value:
+                normalized_item[f"{key}_id"] = value['id']
+                del normalized_item[key]
+            # If the dict has 'nodes' field, extract the nodes array
+            elif 'nodes' in value:
+                normalized_item[key] = value['nodes']
+
+    return normalized_item

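A quick usage sketch of the new helper above; the Linear-issue payload is made up, and the import path assumes the module layout shown in this diff:

    from ingestr.src.linear.helpers import normalize_dictionaries

    issue = {
        "id": "abc",
        "title": "Fix crash",
        "assignee": {"id": "user-1"},            # nested object with an "id" -> flattened to assignee_id
        "labels": {"nodes": [{"name": "bug"}]},  # connection object with "nodes" -> plain list
    }
    print(normalize_dictionaries(issue))
    # {'id': 'abc', 'title': 'Fix crash', 'labels': [{'name': 'bug'}], 'assignee_id': 'user-1'}
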
ingestr/src/mongodb/__init__.py
CHANGED
@@ -106,6 +106,7 @@ def mongodb_collection(
     filter_: Optional[Dict[str, Any]] = None,
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
     pymongoarrow_schema: Optional[Any] = None,
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Any:
     """
     A DLT source which loads a collection from a mongo database using PyMongo.
@@ -132,6 +133,7 @@ def mongodb_collection(
                 exclude (dict) - {"released": False, "runtime": False}
             Note: Can't mix include and exclude statements '{"title": True, "released": False}`
         pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -161,4 +163,5 @@ def mongodb_collection(
         filter_=filter_ or {},
         projection=projection,
         pymongoarrow_schema=pymongoarrow_schema,
+        custom_query=custom_query,
     )

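A sketch of the kind of value the new custom_query parameter above is meant to carry: a standard MongoDB aggregation pipeline, i.e. a list of stage dicts. The stages and field names below are illustrative only, and such a list would be passed as custom_query=pipeline alongside the usual connection and collection arguments:

    # Illustrative aggregation pipeline; per the docstring above it runs instead of a plain find().
    pipeline = [
        {"$match": {"status": "active"}},
        {"$sort": {"updated_at": 1}},
        {"$project": {"_id": 1, "status": 1, "updated_at": 1}},
    ]
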