ingestr 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

@@ -0,0 +1,452 @@
1
+ """Jira source helpers"""
2
+
3
+ import base64
4
+ import logging
5
+ import time
6
+ from typing import Any, Dict, Iterator, Optional
7
+ from urllib.parse import urljoin
8
+
9
+ import requests
10
+
11
+ from .settings import API_BASE_PATH, DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE, REQUEST_TIMEOUT
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class JiraAPIError(Exception):
    """Base error for failed Jira API calls.

    Carries the HTTP status code and raw response body alongside the
    message; both are ``None`` when not supplied by the raiser.
    """

    def __init__(
        self,
        message: str,
        status_code: Optional[int] = None,
        response_text: Optional[str] = None,
    ):
        # Record the HTTP details first, then let Exception store the message.
        self.response_text = response_text
        self.status_code = status_code
        Exception.__init__(self, message)
28
+
29
+
30
class JiraAuthenticationError(JiraAPIError):
    """Signals that Jira refused the request for authentication reasons."""
34
+
35
+
36
class JiraRateLimitError(JiraAPIError):
    """Signals that the Jira API rate limit was exceeded."""
40
+
41
+
42
class JiraClient:
    """Jira REST API client with authentication and pagination support.

    Requests are authenticated with HTTP Basic auth built from an email and
    an API token. Network errors, HTTP 429 and HTTP 5xx responses are retried;
    every other HTTP error is raised immediately as a ``JiraAPIError`` (or a
    subclass) carrying the status code and response body.
    """

    # Seconds to wait on HTTP 429 when Retry-After is absent or not an integer.
    _DEFAULT_RETRY_AFTER = 60
    # Upper bound on ``startAt``; stops pagination if a server never signals
    # the last page.
    _MAX_START_AT = 100000

    def __init__(
        self, base_url: str, email: str, api_token: str, timeout: int = REQUEST_TIMEOUT
    ):
        """
        Initialize Jira client with basic auth.

        Args:
            base_url: Jira instance URL (e.g., https://your-domain.atlassian.net)
            email: User email for authentication
            api_token: API token for authentication
            timeout: Request timeout in seconds
        """
        self.base_url = base_url.rstrip("/")
        # API_BASE_PATH is joined with urljoin; when it is an absolute path
        # (leading "/"), any path component of base_url is replaced, not
        # extended.
        self.api_url = urljoin(self.base_url, API_BASE_PATH)
        self.timeout = timeout

        # Create basic auth header
        credentials = f"{email}:{api_token}"
        encoded_credentials = base64.b64encode(credentials.encode()).decode()

        self.headers = {
            "Authorization": f"Basic {encoded_credentials}",
            "Accept": "application/json",
            "Content-Type": "application/json",
        }

    @staticmethod
    def _retry_after_seconds(header_value: Optional[str], default: int) -> int:
        """Return the Retry-After delay in whole seconds.

        Falls back to ``default`` when the header is missing or is not an
        integer (e.g. an HTTP-date), and never returns a negative value so the
        result is always safe to pass to ``time.sleep``.
        """
        try:
            seconds = int(header_value)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return default
        return max(0, seconds)

    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict[str, Any]] = None,
        method: str = "GET",
        max_retries: int = 3,
        backoff_factor: float = 1.0,
    ) -> Any:
        """
        Make HTTP request to Jira API with retry logic.

        Args:
            endpoint: API endpoint path, relative to the API base URL
            params: Query parameters
            method: HTTP method
            max_retries: Maximum number of retry attempts
            backoff_factor: Factor for exponential backoff
                (wait = backoff_factor * 2**attempt)

        Returns:
            Decoded JSON response (a dict, or a list for endpoints that
            return a bare JSON array)

        Raises:
            JiraAuthenticationError: On HTTP 401 or 403 (never retried)
            JiraRateLimitError: If HTTP 429 persists after all retries
            JiraAPIError: On any other HTTP error, invalid JSON, or a network
                failure that persists after all retries
        """
        url = urljoin(self.api_url + "/", endpoint.lstrip("/"))

        for attempt in range(max_retries + 1):
            can_retry = attempt < max_retries
            logger.debug("Making request to %s (attempt %d)", url, attempt + 1)

            # Only the network call itself is guarded here, so that HTTP
            # status handling below is never mistaken for a transport error.
            try:
                response = requests.request(
                    method=method,
                    url=url,
                    headers=self.headers,
                    params=params,
                    timeout=self.timeout,
                )
            except requests.RequestException as e:
                if can_retry:
                    wait_time = backoff_factor * (2**attempt)
                    logger.warning(
                        "Request failed: %s. Retrying in %s seconds.", e, wait_time
                    )
                    time.sleep(wait_time)
                    continue
                raise JiraAPIError(
                    f"Request failed after {max_retries} retries: {str(e)}"
                ) from e

            status = response.status_code

            if status == 401:
                raise JiraAuthenticationError(
                    "Authentication failed. Please check your email and API token.",
                    status_code=status,
                    response_text=response.text,
                )
            if status == 403:
                raise JiraAuthenticationError(
                    "Access forbidden. Please check your permissions.",
                    status_code=status,
                    response_text=response.text,
                )
            if status == 429:
                # Rate limit exceeded
                if can_retry:
                    retry_after = self._retry_after_seconds(
                        response.headers.get("Retry-After"),
                        self._DEFAULT_RETRY_AFTER,
                    )
                    logger.warning(
                        "Rate limit exceeded. Waiting %s seconds before retry.",
                        retry_after,
                    )
                    time.sleep(retry_after)
                    continue
                raise JiraRateLimitError(
                    f"Rate limit exceeded after {max_retries} retries.",
                    status_code=status,
                    response_text=response.text,
                )
            if status >= 500:
                # Server error - retry with backoff
                if can_retry:
                    wait_time = backoff_factor * (2**attempt)
                    logger.warning(
                        "Server error %s. Retrying in %s seconds.", status, wait_time
                    )
                    time.sleep(wait_time)
                    continue
                raise JiraAPIError(
                    f"Server error after {max_retries} retries.",
                    status_code=status,
                    response_text=response.text,
                )
            if status >= 400:
                # Remaining client errors (400, 404, ...) will not succeed on
                # retry; fail fast and keep the status/body for the caller.
                raise JiraAPIError(
                    f"Request failed with HTTP status {status}.",
                    status_code=status,
                    response_text=response.text,
                )

            # Try to parse JSON response
            try:
                return response.json()
            except ValueError as e:
                raise JiraAPIError(
                    f"Invalid JSON response: {str(e)}",
                    status_code=status,
                    response_text=response.text,
                ) from e

        # Only reachable when max_retries is negative (the loop never runs).
        raise JiraAPIError(f"Request failed after {max_retries} retries")

    def get_paginated(
        self,
        endpoint: str,
        params: Optional[Dict[str, Any]] = None,
        page_size: int = DEFAULT_PAGE_SIZE,
        max_results: Optional[int] = None,
        start_at: int = 0,
    ) -> Iterator[Dict[str, Any]]:
        """
        Get paginated results from Jira API with error handling.

        Three response shapes are handled: a dict with ``values`` (optionally
        ``isLast``/``total``), a dict with ``issues`` (optionally ``total``),
        and a bare list, which is treated as a single, final page. Any other
        response is yielded once as a single item.

        Args:
            endpoint: API endpoint path
            params: Query parameters; copied, never modified. Any ``startAt``
                or ``maxResults`` entries are overridden by ``start_at`` and
                ``page_size``.
            page_size: Number of items per page (clamped to 1..MAX_PAGE_SIZE)
            max_results: Maximum total results to return; ``None`` or ``0``
                means no limit
            start_at: Index of the first item to request

        Yields:
            Individual items from paginated response

        Raises:
            JiraAPIError: If pagination fails
        """
        # Work on a private copy so the caller's dict is left untouched.
        query: Dict[str, Any] = dict(params) if params else {}

        # Validate page size
        page_size = min(max(1, page_size), MAX_PAGE_SIZE)
        first_index = max(0, start_at)
        query["maxResults"] = page_size
        query["startAt"] = first_index

        total_returned = 0

        logger.info(
            "Starting paginated request to %s with page_size=%s", endpoint, page_size
        )

        while True:
            try:
                response = self._make_request(endpoint, query)

                # Handle different response structures
                if isinstance(response, list):
                    # Some endpoints return arrays directly
                    items = response
                    total = len(items)
                    is_last = True
                elif "values" in response:
                    items = response["values"]
                    # A missing total stays None: defaulting it to the page
                    # length would end pagination after the first full page.
                    total = response.get("total")
                    is_last = response.get("isLast", False)
                elif "issues" in response:
                    items = response["issues"]
                    total = response.get("total")
                    is_last = len(items) < page_size
                else:
                    # Single item response
                    logger.debug("Received single item response from %s", endpoint)
                    yield response
                    break

                logger.debug(
                    "Retrieved %d items from %s (page %d)",
                    len(items),
                    endpoint,
                    (query["startAt"] - first_index) // page_size + 1,
                )

                for item in items:
                    if max_results and total_returned >= max_results:
                        logger.info("Reached max_results limit of %s", max_results)
                        return
                    yield item
                    total_returned += 1

                # Check if we've reached the end. An empty page is always a
                # short page (page_size >= 1), so it ends the loop here too.
                if is_last or len(items) < page_size:
                    logger.debug("Reached end of pagination for %s", endpoint)
                    break

                # Check if we've got all available items
                if total and first_index + total_returned >= total:
                    logger.debug(
                        "Retrieved all %s available items from %s", total, endpoint
                    )
                    break

                # Move to next page
                query["startAt"] += page_size

                # Safety check to prevent infinite loops
                if query["startAt"] > self._MAX_START_AT:
                    logger.warning(
                        "Pagination safety limit reached for %s, stopping", endpoint
                    )
                    break

            except JiraAPIError as e:
                logger.error("API error during pagination of %s: %s", endpoint, e)
                raise
            except Exception as e:
                logger.error(
                    "Unexpected error during pagination of %s: %s", endpoint, e
                )
                raise JiraAPIError(f"Pagination failed: {str(e)}") from e

        logger.info(
            "Completed pagination for %s, returned %s items", endpoint, total_returned
        )

    def _iter_list_endpoint(self, endpoint: str) -> Iterator[Dict[str, Any]]:
        """Yield each element of an endpoint that returns a bare JSON array.

        Yields nothing when the response is not a list.
        """
        response = self._make_request(endpoint)
        if isinstance(response, list):
            yield from response

    def search_issues(
        self,
        jql: str,
        fields: Optional[str] = None,
        expand: Optional[str] = None,
        page_size: int = DEFAULT_PAGE_SIZE,
        max_results: Optional[int] = None,
    ) -> Iterator[Dict[str, Any]]:
        """
        Search for issues using JQL.

        Args:
            jql: JQL query string
            fields: Comma-separated list of fields to return
            expand: Comma-separated list of fields to expand
            page_size: Number of items per page
            max_results: Maximum total results to return

        Yields:
            Issue data
        """
        # NOTE(review): Atlassian has announced that the Jira Cloud "search"
        # endpoint is being replaced by "search/jql" - confirm it is still
        # served for the targeted instances.
        params = {"jql": jql}
        if fields:
            params["fields"] = fields
        if expand:
            params["expand"] = expand

        yield from self.get_paginated(
            "search", params=params, page_size=page_size, max_results=max_results
        )

    def get_projects(
        self, expand: Optional[str] = None, recent: Optional[int] = None
    ) -> Iterator[Dict[str, Any]]:
        """
        Get all projects.

        Args:
            expand: Comma-separated list of fields to expand
            recent: Number of recent projects to return

        Yields:
            Project data
        """
        params = {}
        if expand:
            params["expand"] = expand
        if recent:
            params["recent"] = str(recent)

        yield from self.get_paginated("project", params=params)

    def get_users(
        self,
        username: Optional[str] = None,
        account_id: Optional[str] = None,
        start_at: int = 0,
        max_results: int = DEFAULT_PAGE_SIZE,
    ) -> Iterator[Dict[str, Any]]:
        """
        Get users.

        Args:
            username: Username to search for
            account_id: Account ID to search for
            start_at: Starting index
            max_results: Maximum results per page

        Yields:
            User data
        """
        params = {}
        if username:
            params["username"] = username
        if account_id:
            params["accountId"] = account_id

        # start_at / max_results are handed to get_paginated explicitly;
        # putting them in ``params`` has no effect because get_paginated
        # always sets startAt/maxResults itself.
        # NOTE(review): if "users/search" answers with a bare list, it is
        # treated as a single page, so at most ``max_results`` users are
        # yielded per call - confirm whether callers need further pages.
        yield from self.get_paginated(
            "users/search", params=params, page_size=max_results, start_at=start_at
        )

    def get_issue_types(self) -> Iterator[Dict[str, Any]]:
        """Get all issue types."""
        yield from self._iter_list_endpoint("issuetype")

    def get_statuses(self) -> Iterator[Dict[str, Any]]:
        """Get all statuses."""
        yield from self._iter_list_endpoint("status")

    def get_priorities(self) -> Iterator[Dict[str, Any]]:
        """Get all priorities."""
        yield from self._iter_list_endpoint("priority")

    def get_resolutions(self) -> Iterator[Dict[str, Any]]:
        """Get all resolutions."""
        yield from self._iter_list_endpoint("resolution")

    def get_project_versions(self, project_key: str) -> Iterator[Dict[str, Any]]:
        """
        Get versions for a specific project.

        Args:
            project_key: Project key

        Yields:
            Version data
        """
        yield from self.get_paginated(f"project/{project_key}/version")

    def get_project_components(self, project_key: str) -> Iterator[Dict[str, Any]]:
        """
        Get components for a specific project.

        Args:
            project_key: Project key

        Yields:
            Component data
        """
        yield from self.get_paginated(f"project/{project_key}/component")
435
+
436
+
437
def get_client(
    base_url: str, email: str, api_token: str, timeout: int = REQUEST_TIMEOUT
) -> JiraClient:
    """
    Build a ready-to-use Jira API client.

    Args:
        base_url: Jira instance URL
        email: User email for authentication
        api_token: API token for authentication
        timeout: Request timeout in seconds

    Returns:
        A new JiraClient configured with the given credentials
    """
    client = JiraClient(
        base_url=base_url,
        email=email,
        api_token=api_token,
        timeout=timeout,
    )
    return client
@@ -0,0 +1,170 @@
1
+ """Jira source settings and constants"""
2
+
3
# Default start date for Jira API requests (ISO date string, YYYY-MM-DD)
DEFAULT_START_DATE = "2010-01-01"

# Jira API request timeout in seconds
REQUEST_TIMEOUT = 300

# Default page size for paginated requests
DEFAULT_PAGE_SIZE = 100

# Maximum page size allowed by Jira API.
# NOTE(review): the effective cap may differ per endpoint and be lower than
# this value - confirm against the Jira REST API documentation.
MAX_PAGE_SIZE = 1000

# Base API path for Jira Cloud (REST API version 3)
API_BASE_PATH = "/rest/api/3"

# Project fields to retrieve from Jira API
PROJECT_FIELDS = (
    "id",
    "key",
    "name",
    "description",
    "lead",
    "projectCategory",
    "projectTypeKey",
    "simplified",
    "style",
    "favourite",
    "isPrivate",
    "properties",
    "entityId",
    "uuid",
    "insight",
)

# Issue fields to retrieve from Jira API
# NOTE(review): Jira names the attachment field "attachment" (singular), and
# "customfield_*" may not be expanded as a wildcard by the API - verify both
# entries against how this tuple is consumed.
ISSUE_FIELDS = (
    "id",
    "key",
    "summary",
    "description",
    "issuetype",
    "status",
    "priority",
    "resolution",
    "assignee",
    "reporter",
    "creator",
    "created",
    "updated",
    "resolutiondate",
    "duedate",
    "components",
    "fixVersions",
    "versions",
    "labels",
    "environment",
    "project",
    "parent",
    "subtasks",
    "issuelinks",
    "votes",
    "watches",
    "worklog",
    "attachments",
    "comment",
    "customfield_*",
)

# User fields to retrieve from Jira API
USER_FIELDS = (
    "accountId",
    "accountType",
    "emailAddress",
    "displayName",
    "active",
    "timeZone",
    "groups",
    "applicationRoles",
    "expand",
)

# Board fields to retrieve from Jira API (for Agile/Scrum boards)
BOARD_FIELDS = (
    "id",
    "name",
    "type",
    "location",
    "filter",
    "subQuery",
)

# Sprint fields to retrieve from Jira API
SPRINT_FIELDS = (
    "id",
    "name",
    "state",
    "startDate",
    "endDate",
    "completeDate",
    "originBoardId",
    "goal",
)

# Issue type fields to retrieve from Jira API
ISSUE_TYPE_FIELDS = (
    "id",
    "name",
    "description",
    "iconUrl",
    "subtask",
    "avatarId",
    "hierarchyLevel",
)

# Status fields to retrieve from Jira API
STATUS_FIELDS = (
    "id",
    "name",
    "description",
    "iconUrl",
    "statusCategory",
)

# Priority fields to retrieve from Jira API
PRIORITY_FIELDS = (
    "id",
    "name",
    "description",
    "iconUrl",
)

# Resolution fields to retrieve from Jira API
RESOLUTION_FIELDS = (
    "id",
    "name",
    "description",
)

# Version fields to retrieve from Jira API
VERSION_FIELDS = (
    "id",
    "name",
    "description",
    "archived",
    "released",
    "startDate",
    "releaseDate",
    "overdue",
    "userStartDate",
    "userReleaseDate",
    "project",
    "projectId",
)

# Component fields to retrieve from Jira API
COMPONENT_FIELDS = (
    "id",
    "name",
    "description",
    "lead",
    "assigneeType",
    "assignee",
    "realAssigneeType",
    "realAssignee",
    "isAssigneeTypeValid",
    "project",
    "projectId",
)
@@ -962,16 +962,46 @@ def process_file_items(file_path: str) -> list[dict]:
962
962
  return documents
963
963
 
964
964
 
965
- def mongodb_insert(uri: str, database: str):
965
+ def mongodb_insert(uri: str):
966
966
  """Creates a dlt.destination for inserting data into a MongoDB collection.
967
967
 
968
968
  Args:
969
- uri (str): MongoDB connection URI.
970
- database (str): Name of the MongoDB database.
969
+ uri (str): MongoDB connection URI including database.
971
970
 
972
971
  Returns:
973
972
  dlt.destination: A DLT destination object configured for MongoDB.
974
973
  """
974
+ from urllib.parse import urlparse
975
+
976
+ parsed_uri = urlparse(uri)
977
+
978
+ # Handle both mongodb:// and mongodb+srv:// schemes
979
+ if uri.startswith("mongodb+srv://") or uri.startswith("mongodb://"):
980
+ # For modern connection strings (MongoDB Atlas), use the URI as-is
981
+ connection_string = uri
982
+ # Extract database from path or use default
983
+ database = (
984
+ parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
985
+ )
986
+ else:
987
+ # Legacy handling for backwards compatibility
988
+ host = parsed_uri.hostname or "localhost"
989
+ port = parsed_uri.port or 27017
990
+ username = parsed_uri.username
991
+ password = parsed_uri.password
992
+ database = (
993
+ parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
994
+ )
995
+
996
+ # Build connection string
997
+ if username and password:
998
+ connection_string = f"mongodb://{username}:{password}@{host}:{port}"
999
+ else:
1000
+ connection_string = f"mongodb://{host}:{port}"
1001
+
1002
+ # Add query parameters if any
1003
+ if parsed_uri.query:
1004
+ connection_string += f"?{parsed_uri.query}"
975
1005
 
976
1006
  state = {"first_batch": True}
977
1007
 
@@ -984,9 +1014,7 @@ def mongodb_insert(uri: str, database: str):
984
1014
  collection_name = table["name"]
985
1015
 
986
1016
  # Connect to MongoDB
987
- client: MongoClient
988
-
989
- with MongoClient(uri) as client:
1017
+ with MongoClient(connection_string) as client:
990
1018
  db = client[database]
991
1019
  collection = db[collection_name]
992
1020