ingestr 0.14.1__py3-none-any.whl → 0.14.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

@@ -0,0 +1,279 @@
1
+ """
2
+ Configuration settings and constants for Intercom API integration.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import Dict, List, Tuple
7
+
8
# Intercom API version; required on every request.
API_VERSION = "2.14"

# Starting point used for incremental loading when none is given.
DEFAULT_START_DATE = datetime(2020, 1, 1)

# --- Pagination ---
DEFAULT_PAGE_SIZE = 150
MAX_PAGE_SIZE = 150  # upper bound allowed by Intercom
SCROLL_EXPIRY_SECONDS = 60  # a scroll session expires after one minute

# --- Rate limiting ---
RATE_LIMIT_PER_10_SECONDS = 166
RATE_LIMIT_RETRY_AFTER_DEFAULT = 10

# Base URL of the API for each supported region.
REGIONAL_ENDPOINTS = dict(
    us="https://api.intercom.io",
    eu="https://api.eu.intercom.io",
    au="https://api.au.intercom.io",
)
29
+
30
def _json_columns(*names: str) -> dict:
    """Return a fresh column-hint mapping that marks each named column as JSON."""
    return {name: {"data_type": "json"} for name in names}


def _search_resource(transform_func: str) -> dict:
    """Build the config of an incremental resource with type "search"."""
    return {
        "type": "search",
        "incremental": True,
        "transform_func": transform_func,
        "columns": _json_columns("custom_attributes", "tags"),
    }


def _paginated_resource(endpoint: str, transform_func, params, columns: dict) -> dict:
    """Build the config of an incremental, cursor-paginated resource."""
    return {
        "type": "pagination",
        "endpoint": endpoint,
        "data_key": "data",
        "pagination_type": "cursor",
        "incremental": True,
        "transform_func": transform_func,
        "params": params,
        "columns": columns,
    }


def _simple_resource(
    endpoint: str, data_key: str, pagination_type: str, **columns: dict
) -> dict:
    """Build the config of a non-incremental resource; keyword args are column hints."""
    return {
        "type": "simple",
        "endpoint": endpoint,
        "data_key": data_key,
        "pagination_type": pagination_type,
        "incremental": False,
        "transform_func": None,
        "columns": columns,
    }


# Per-resource settings used to generate the resources automatically.
# Maps resource_name -> config dict.
RESOURCE_CONFIGS = {
    # Incremental resources of type "search"
    "contacts": _search_resource("transform_contact"),
    "conversations": _search_resource("transform_conversation"),
    # Incremental resources of type "pagination"
    "companies": _paginated_resource(
        "/companies",
        transform_func="transform_company",
        params={"per_page": 50},
        columns=_json_columns("custom_attributes", "tags"),
    ),
    "articles": _paginated_resource(
        "/articles",
        transform_func=None,
        params=None,
        columns={},
    ),
    # Tickets have their own resource type
    "tickets": {
        "type": "tickets",
        "incremental": True,
        "transform_func": None,
        "columns": _json_columns("ticket_attributes"),
    },
    # Non-incremental resources of type "simple"
    "tags": _simple_resource("/tags", "data", "simple"),
    "segments": _simple_resource("/segments", "segments", "cursor"),
    "teams": _simple_resource("/teams", "teams", "simple"),
    "admins": _simple_resource("/admins", "admins", "simple"),
    "data_attributes": _simple_resource(
        "/data_attributes",
        "data",
        "cursor",
        id={"data_type": "bigint", "nullable": True},
    ),
}
134
+
135
# Core endpoint table, retained for backwards compatibility.
# Each value is (endpoint_path, data_key, supports_incremental, pagination_type).
CORE_ENDPOINTS: Dict[str, Tuple[str, str, bool, str]] = {
    "contacts": ("/contacts", "data", True, "cursor"),
    "companies": ("/companies", "data", True, "cursor"),
    "conversations": ("/conversations", "conversations", True, "cursor"),
    "tickets": ("/tickets", "tickets", True, "cursor"),
    "admins": ("/admins", "admins", False, "simple"),
    "teams": ("/teams", "teams", False, "simple"),
    "tags": ("/tags", "data", False, "simple"),
    "segments": ("/segments", "segments", False, "cursor"),
    "articles": ("/articles", "data", True, "cursor"),
    "collections": ("/help_center/collections", "data", False, "cursor"),
    "data_attributes": ("/data_attributes", "data", False, "cursor"),
}

# Search API paths used for incremental loading: "<resource>_search" -> "/<resource>/search".
SEARCH_ENDPOINTS: Dict[str, str] = {
    f"{resource}_search": f"/{resource}/search"
    for resource in ("contacts", "companies", "conversations")
}

# Endpoints that need special handling; companies can use scroll for large exports.
SCROLL_ENDPOINTS: List[str] = ["companies"]

# Path of the event tracking endpoint.
EVENTS_ENDPOINT = "/events"

# Path template of the ticket-fields endpoint, used for custom field mapping.
TICKET_FIELDS_ENDPOINT = "/ticket_types/{ticket_type_id}/attributes"
168
+
169
# Fields retrieved by default for each resource type. Related names are kept
# on the same row; the order of the entries is significant and unchanged.
DEFAULT_CONTACT_FIELDS = [
    "id", "type", "external_id", "email", "phone", "name",
    "created_at", "updated_at", "signed_up_at",
    "last_seen_at", "last_contacted_at",
    "last_email_opened_at", "last_email_clicked_at",
    "browser", "browser_language", "browser_version",
    "location", "os", "role",
    "custom_attributes", "tags", "companies",
]

DEFAULT_COMPANY_FIELDS = [
    "id", "type", "company_id", "name",
    "plan", "size", "website", "industry",
    "created_at", "updated_at",
    "monthly_spend", "session_count", "user_count",
    "custom_attributes", "tags",
]

DEFAULT_CONVERSATION_FIELDS = [
    "id", "type",
    "created_at", "updated_at", "waiting_since", "snoozed_until",
    "state", "open", "read", "priority",
    "admin_assignee_id", "team_assignee_id",
    "tags", "conversation_rating", "source",
    "contacts", "teammates", "custom_attributes",
    "first_contact_reply", "sla_applied", "statistics",
    "conversation_parts",
]

DEFAULT_TICKET_FIELDS = [
    "id", "type", "ticket_id", "category",
    "ticket_attributes", "ticket_state", "ticket_type",
    "created_at", "updated_at",
    "ticket_parts", "contacts",
    "admin_assignee_id", "team_assignee_id",
    "open", "snoozed_until",
]
255
+
256
# Resource names that support custom attributes.
SUPPORTS_CUSTOM_ATTRIBUTES = ["contacts", "companies", "conversations"]

# Upper limits.
MAX_CUSTOM_ATTRIBUTES_PER_RESOURCE = 100
MAX_EVENT_TYPES_PER_WORKSPACE = 120
MAX_CONVERSATION_PARTS = 500
MAX_SEARCH_RESULTS = 10_000

# Intercom custom-attribute field type -> dlt data type.
INTERCOM_TO_DLT_TYPE_MAPPING = {
    # scalar types
    "string": "text",
    "integer": "bigint",
    "float": "double",
    "boolean": "bool",
    # both temporal types map to a timestamp
    "date": "timestamp",
    "datetime": "timestamp",
    # nested values are stored as JSON
    "object": "json",
    "list": "json",
}
@@ -0,0 +1,314 @@
1
+ """
2
+ This source provides data extraction from Jira Cloud via the REST API v3.
3
+
4
+ It defines several functions to fetch data from different parts of Jira including
5
+ projects, issues, users, project versions, project components, and configuration
6
+ objects like issue types, statuses, priorities, and resolutions.
7
+ """
8
+
9
+ from typing import Any, Iterable, Optional
10
+
11
+ import dlt
12
+ from dlt.common.typing import TDataItem
13
+
14
+ from .helpers import get_client
15
+ from .settings import (
16
+ DEFAULT_PAGE_SIZE,
17
+ DEFAULT_START_DATE,
18
+ ISSUE_FIELDS,
19
+ )
20
+
21
+
22
@dlt.source
def jira_source() -> Any:
    """
    Bundle every Jira resource defined in this module into a single dlt source.

    Returns:
        Sequence[DltResource]: The resources and transformers, in the order
        projects, issues, users, issue_types, statuses, priorities,
        resolutions, project_versions, project_components.
    """
    top_level_resources = [
        projects,
        issues,
        users,
        issue_types,
        statuses,
        priorities,
        resolutions,
    ]
    # These two are transformers fed by the `projects` resource.
    per_project_transformers = [project_versions, project_components]
    return top_level_resources + per_project_transformers
41
+
42
+
43
@dlt.resource(write_disposition="replace")
def projects(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
    expand: Optional[str] = None,
    recent: Optional[int] = None,
) -> Iterable[TDataItem]:
    """
    Yield the projects returned by the Jira client.

    Args:
        base_url (str): Jira instance URL (e.g., https://your-domain.atlassian.net)
        email (str): User email for authentication
        api_token (str): API token for authentication
        expand (str): Comma-separated list of fields to expand
        recent (int): Number of recent projects to return

    Yields:
        dict: One project record at a time.
    """
    jira = get_client(base_url, email, api_token)
    # Both options are forwarded to the client unchanged.
    project_options = {"expand": expand, "recent": recent}
    yield from jira.get_projects(**project_options)
66
+
67
+
68
def _split_order_by(jql: str) -> tuple:
    """
    Split a JQL string into its condition part and its ORDER BY clause.

    The keyword is searched case-insensitively on the original string (never on
    an upper-cased copy, whose length can differ for some characters), and an
    occurrence inside a quoted literal is skipped.

    Args:
        jql (str): The JQL query string to split.

    Returns:
        tuple: ``(conditions, order_clause)``, both stripped. ``order_clause`` is
        empty when no ORDER BY is found; ``conditions`` is empty when the query
        consists of an ORDER BY clause only (or of nothing at all).
    """
    import re  # function-scope import: leaves the module's import block untouched

    scanner = re.compile(
        r'"(?:\\.|[^"\\])*"'  # double-quoted literal, consumed and ignored
        r"|'(?:\\.|[^'\\])*'"  # single-quoted literal, consumed and ignored
        r"|(\border\s+by\b)",  # the ORDER BY keyword itself
        re.IGNORECASE,
    )
    for match in scanner.finditer(jql):
        if match.group(1):
            cut = match.start(1)
            return jql[:cut].strip(), jql[cut:].strip()
    return jql.strip(), ""


def _apply_updated_filter(jql: str, date_filter: str) -> str:
    """
    AND ``date_filter`` onto the conditions of ``jql``, ahead of any ORDER BY clause.

    Existing conditions are always wrapped in parentheses so that an OR inside
    them cannot capture the added filter. A query without conditions becomes
    the filter alone instead of starting with a dangling ``AND``.

    Args:
        jql (str): The caller-supplied JQL query string.
        date_filter (str): A complete JQL condition to add.

    Returns:
        str: The combined JQL query string.
    """
    conditions, order_clause = _split_order_by(jql)
    combined = f"({conditions}) AND {date_filter}" if conditions else date_filter
    return f"{combined} {order_clause}" if order_clause else combined


@dlt.resource(write_disposition="merge", primary_key="id")
def issues(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
    jql: str = "order by updated DESC",
    fields: Optional[str] = None,
    expand: Optional[str] = None,
    max_results: Optional[int] = None,
    updated: dlt.sources.incremental[str] = dlt.sources.incremental(
        "fields.updated",
        initial_value=DEFAULT_START_DATE,
        range_end="closed",
        range_start="closed",
    ),
) -> Iterable[TDataItem]:
    """
    Fetches issues from Jira using JQL search.

    When the incremental cursor has a start value, an ``updated >= '<value>'``
    condition is added to the query before it is sent to the client.

    Args:
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication
        jql (str): JQL query string
        fields (str): Comma-separated list of fields to return; defaults to
            ISSUE_FIELDS joined with commas
        expand (str): Comma-separated list of fields to expand
        max_results (int): Maximum number of results to return
        updated (str): The date from which to fetch updated issues

    Yields:
        dict: The issue data.
    """
    client = get_client(base_url, email, api_token)

    # Build JQL with incremental filter
    incremental_jql = jql
    if updated.start_value:
        # NOTE(review): the cursor value is interpolated verbatim. Confirm that
        # DEFAULT_START_DATE and the stored "fields.updated" values are in a
        # date format that JQL accepts for the `updated` field.
        date_filter = f"updated >= '{updated.start_value}'"
        incremental_jql = _apply_updated_filter(jql, date_filter)

    # Use default fields if not specified
    if fields is None:
        fields = ",".join(ISSUE_FIELDS)

    yield from client.search_issues(
        jql=incremental_jql, fields=fields, expand=expand, max_results=max_results
    )
140
+
141
+
142
@dlt.resource(write_disposition="replace")
def users(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
    username: Optional[str] = None,
    account_id: Optional[str] = None,
    max_results: int = DEFAULT_PAGE_SIZE,
) -> Iterable[TDataItem]:
    """
    Yield the users returned by the Jira client.

    Args:
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication
        username (str): Username to search for
        account_id (str): Account ID to search for
        max_results (int): Maximum results per page

    Yields:
        dict: One user record at a time.
    """
    jira = get_client(base_url, email, api_token)
    # The search criteria and page size are forwarded to the client unchanged.
    user_query = {
        "username": username,
        "account_id": account_id,
        "max_results": max_results,
    }
    yield from jira.get_users(**user_query)
169
+
170
+
171
@dlt.resource(write_disposition="replace")
def issue_types(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
) -> Iterable[TDataItem]:
    """
    Yield every issue type returned by the Jira client.

    Args:
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication

    Yields:
        dict: One issue type record at a time.
    """
    yield from get_client(base_url, email, api_token).get_issue_types()
190
+
191
+
192
@dlt.resource(write_disposition="replace")
def statuses(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
) -> Iterable[TDataItem]:
    """
    Yield every status returned by the Jira client.

    Args:
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication

    Yields:
        dict: One status record at a time.
    """
    yield from get_client(base_url, email, api_token).get_statuses()
211
+
212
+
213
@dlt.resource(write_disposition="replace")
def priorities(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
) -> Iterable[TDataItem]:
    """
    Yield every priority returned by the Jira client.

    Args:
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication

    Yields:
        dict: One priority record at a time.
    """
    yield from get_client(base_url, email, api_token).get_priorities()
232
+
233
+
234
@dlt.resource(write_disposition="replace")
def resolutions(
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
) -> Iterable[TDataItem]:
    """
    Yield every resolution returned by the Jira client.

    Args:
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication

    Yields:
        dict: One resolution record at a time.
    """
    yield from get_client(base_url, email, api_token).get_resolutions()
253
+
254
+
255
@dlt.transformer(
    data_from=projects,
    write_disposition="replace",
)
@dlt.defer
def project_versions(
    project: TDataItem,
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
) -> Iterable[TDataItem]:
    """
    Collect the versions of one project received from the `projects` resource.

    Args:
        project (dict): The project data; its "key" entry identifies the project.
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication

    Returns:
        list[dict]: The version data for the given project, or an empty list
        when the project has no key.
    """
    jira = get_client(base_url, email, api_token)
    key = project.get("key")
    # The client's result is materialised into a list before it is returned.
    return list(jira.get_project_versions(key)) if key else []
284
+
285
+
286
@dlt.transformer(
    data_from=projects,
    write_disposition="replace",
)
@dlt.defer
def project_components(
    project: TDataItem,
    base_url: str = dlt.secrets.value,
    email: str = dlt.secrets.value,
    api_token: str = dlt.secrets.value,
) -> Iterable[TDataItem]:
    """
    Collect the components of one project received from the `projects` resource.

    Args:
        project (dict): The project data; its "key" entry identifies the project.
        base_url (str): Jira instance URL
        email (str): User email for authentication
        api_token (str): API token for authentication

    Returns:
        list[dict]: The component data for the given project, or an empty list
        when the project has no key.
    """
    jira = get_client(base_url, email, api_token)
    key = project.get("key")
    # The client's result is materialised into a list before it is returned.
    return list(jira.get_project_components(key)) if key else []