PyPI - ingestr - Versions diffs - 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl - Mend

ingestr 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ingestr might be problematic. Click here for more details.

Files changed (19)

ingestr/src/buildinfo.py +1 -1
ingestr/src/destinations.py +1 -24
ingestr/src/elasticsearch/helpers.py +35 -9
ingestr/src/factory.py +2 -0
ingestr/src/fluxx/__init__.py +9 -0
ingestr/src/freshdesk/__init__.py +2 -0
ingestr/src/freshdesk/freshdesk_client.py +15 -1
ingestr/src/hubspot/__init__.py +6 -12
ingestr/src/intercom/settings.py +3 -1
ingestr/src/jira_source/__init__.py +314 -0
ingestr/src/jira_source/helpers.py +452 -0
ingestr/src/jira_source/settings.py +170 -0
ingestr/src/mongodb/helpers.py +34 -6
ingestr/src/sources.py +55 -0
{ingestr-0.14.2.dist-info → ingestr-0.14.4.dist-info}/METADATA +1 -1
{ingestr-0.14.2.dist-info → ingestr-0.14.4.dist-info}/RECORD +19 -16
{ingestr-0.14.2.dist-info → ingestr-0.14.4.dist-info}/WHEEL +0 -0
{ingestr-0.14.2.dist-info → ingestr-0.14.4.dist-info}/entry_points.txt +0 -0
{ingestr-0.14.2.dist-info → ingestr-0.14.4.dist-info}/licenses/LICENSE.md +0 -0

ingestr/src/jira_source/helpers.py ADDED Viewed

@@ -0,0 +1,452 @@
+"""Jira source helpers"""
+import base64
+import logging
+import time
+from typing import Any, Dict, Iterator, Optional
+from urllib.parse import urljoin
+import requests
+from .settings import API_BASE_PATH, DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE, REQUEST_TIMEOUT
+logger = logging.getLogger(__name__)
+class JiraAPIError(Exception):
+    """Custom exception for Jira API errors."""
+    def __init__(
+        self,
+        message: str,
+        status_code: Optional[int] = None,
+        response_text: Optional[str] = None,
+    ):
+        super().__init__(message)
+        self.status_code = status_code
+        self.response_text = response_text
+class JiraAuthenticationError(JiraAPIError):
+    """Exception raised for authentication failures."""
+    pass
+class JiraRateLimitError(JiraAPIError):
+    """Exception raised when rate limit is exceeded."""
+    pass
+class JiraClient:
+    """Jira REST API client with authentication and pagination support."""
+    def __init__(
+        self, base_url: str, email: str, api_token: str, timeout: int = REQUEST_TIMEOUT
+    ):
+        """
+        Initialize Jira client with basic auth.
+        Args:
+            base_url: Jira instance URL (e.g., https://your-domain.atlassian.net)
+            email: User email for authentication
+            api_token: API token for authentication
+            timeout: Request timeout in seconds
+        """
+        self.base_url = base_url.rstrip("/")
+        self.api_url = urljoin(self.base_url, API_BASE_PATH)
+        self.timeout = timeout
+        # Create basic auth header
+        credentials = f"{email}:{api_token}"
+        encoded_credentials = base64.b64encode(credentials.encode()).decode()
+        self.headers = {
+            "Authorization": f"Basic {encoded_credentials}",
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+    def _make_request(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        method: str = "GET",
+        max_retries: int = 3,
+        backoff_factor: float = 1.0,
+    ) -> Dict[str, Any]:
+        """
+        Make HTTP request to Jira API with retry logic.
+        Args:
+            endpoint: API endpoint path
+            params: Query parameters
+            method: HTTP method
+            max_retries: Maximum number of retry attempts
+            backoff_factor: Factor for exponential backoff
+        Returns:
+            JSON response data
+        Raises:
+            JiraAPIError: If request fails after retries
+            JiraAuthenticationError: If authentication fails
+            JiraRateLimitError: If rate limit is exceeded
+        """
+        url = urljoin(self.api_url + "/", endpoint.lstrip("/"))
+        for attempt in range(max_retries + 1):
+            try:
+                logger.debug(f"Making request to {url} (attempt {attempt + 1})")
+                response = requests.request(
+                    method=method,
+                    url=url,
+                    headers=self.headers,
+                    params=params,
+                    timeout=self.timeout,
+                )
+                # Handle different error status codes
+                if response.status_code == 401:
+                    raise JiraAuthenticationError(
+                        "Authentication failed. Please check your email and API token.",
+                        status_code=response.status_code,
+                        response_text=response.text,
+                    )
+                elif response.status_code == 403:
+                    raise JiraAuthenticationError(
+                        "Access forbidden. Please check your permissions.",
+                        status_code=response.status_code,
+                        response_text=response.text,
+                    )
+                elif response.status_code == 429:
+                    # Rate limit exceeded
+                    retry_after = int(response.headers.get("Retry-After", 60))
+                    if attempt < max_retries:
+                        logger.warning(
+                            f"Rate limit exceeded. Waiting {retry_after} seconds before retry."
+                        )
+                        time.sleep(retry_after)  # type: ignore
+                        continue
+                    else:
+                        raise JiraRateLimitError(
+                            f"Rate limit exceeded after {max_retries} retries.",
+                            status_code=response.status_code,
+                            response_text=response.text,
+                        )
+                elif response.status_code >= 500:
+                    # Server error - retry with backoff
+                    if attempt < max_retries:
+                        wait_time = backoff_factor * (2**attempt)
+                        logger.warning(
+                            f"Server error {response.status_code}. Retrying in {wait_time} seconds."
+                        )
+                        time.sleep(wait_time)  # type: ignore
+                        continue
+                    else:
+                        raise JiraAPIError(
+                            f"Server error after {max_retries} retries.",
+                            status_code=response.status_code,
+                            response_text=response.text,
+                        )
+                # Raise for other HTTP errors
+                response.raise_for_status()
+                # Try to parse JSON response
+                try:
+                    return response.json()
+                except ValueError as e:
+                    raise JiraAPIError(
+                        f"Invalid JSON response: {str(e)}",
+                        status_code=response.status_code,
+                        response_text=response.text,
+                    )
+            except requests.RequestException as e:
+                if attempt < max_retries:
+                    wait_time = backoff_factor * (2**attempt)
+                    logger.warning(
+                        f"Request failed: {str(e)}. Retrying in {wait_time} seconds."
+                    )
+                    time.sleep(wait_time)  # type: ignore
+                    continue
+                else:
+                    raise JiraAPIError(
+                        f"Request failed after {max_retries} retries: {str(e)}"
+                    )
+        raise JiraAPIError(f"Request failed after {max_retries} retries")
+    def get_paginated(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        page_size: int = DEFAULT_PAGE_SIZE,
+        max_results: Optional[int] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Get paginated results from Jira API with error handling.
+        Args:
+            endpoint: API endpoint path
+            params: Query parameters
+            page_size: Number of items per page
+            max_results: Maximum total results to return
+        Yields:
+            Individual items from paginated response
+        Raises:
+            JiraAPIError: If pagination fails
+        """
+        if params is None:
+            params = {}
+        # Validate page size
+        page_size = min(max(1, page_size), MAX_PAGE_SIZE)
+        params["maxResults"] = page_size
+        params["startAt"] = 0
+        total_returned = 0
+        consecutive_empty_pages = 0
+        max_empty_pages = 3
+        logger.info(
+            f"Starting paginated request to {endpoint} with page_size={page_size}"
+        )
+        while True:
+            try:
+                response = self._make_request(endpoint, params)
+                # Handle different response structures
+                if "values" in response:
+                    items = response["values"]
+                    total = response.get("total", len(items))
+                    is_last = response.get("isLast", False)
+                elif "issues" in response:
+                    items = response["issues"]
+                    total = response.get("total", len(items))
+                    is_last = len(items) < page_size
+                elif isinstance(response, list):
+                    # Some endpoints return arrays directly
+                    items = response
+                    total = len(items)
+                    is_last = True
+                else:
+                    # Single item response
+                    logger.debug(f"Received single item response from {endpoint}")
+                    yield response
+                    break
+                # Check for empty pages
+                if not items:
+                    consecutive_empty_pages += 1
+                    if consecutive_empty_pages >= max_empty_pages:
+                        logger.warning(
+                            f"Received {consecutive_empty_pages} consecutive empty pages, stopping pagination"
+                        )
+                        break
+                else:
+                    consecutive_empty_pages = 0
+                logger.debug(
+                    f"Retrieved {len(items)} items from {endpoint} (page {params['startAt'] // page_size + 1})"
+                )
+                for item in items:
+                    if max_results and total_returned >= max_results:
+                        logger.info(f"Reached max_results limit of {max_results}")
+                        return
+                    yield item
+                    total_returned += 1
+                # Check if we've reached the end
+                if is_last or len(items) < page_size:
+                    logger.debug(f"Reached end of pagination for {endpoint}")
+                    break
+                # Check if we've got all available items
+                if total and total_returned >= total:
+                    logger.debug(
+                        f"Retrieved all {total} available items from {endpoint}"
+                    )
+                    break
+                # Move to next page
+                params["startAt"] += page_size
+                # Safety check to prevent infinite loops
+                if params["startAt"] > 100000:  # Arbitrary large number
+                    logger.warning(
+                        f"Pagination safety limit reached for {endpoint}, stopping"
+                    )
+                    break
+            except JiraAPIError as e:
+                logger.error(f"API error during pagination of {endpoint}: {str(e)}")
+                raise
+            except Exception as e:
+                logger.error(
+                    f"Unexpected error during pagination of {endpoint}: {str(e)}"
+                )
+                raise JiraAPIError(f"Pagination failed: {str(e)}")
+        logger.info(
+            f"Completed pagination for {endpoint}, returned {total_returned} items"
+        )
+    def search_issues(
+        self,
+        jql: str,
+        fields: Optional[str] = None,
+        expand: Optional[str] = None,
+        page_size: int = DEFAULT_PAGE_SIZE,
+        max_results: Optional[int] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Search for issues using JQL.
+        Args:
+            jql: JQL query string
+            fields: Comma-separated list of fields to return
+            expand: Comma-separated list of fields to expand
+            page_size: Number of items per page
+            max_results: Maximum total results to return
+        Yields:
+            Issue data
+        """
+        params = {"jql": jql}
+        if fields:
+            params["fields"] = fields
+        if expand:
+            params["expand"] = expand
+        yield from self.get_paginated(
+            "search", params=params, page_size=page_size, max_results=max_results
+        )
+    def get_projects(
+        self, expand: Optional[str] = None, recent: Optional[int] = None
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Get all projects.
+        Args:
+            expand: Comma-separated list of fields to expand
+            recent: Number of recent projects to return
+        Yields:
+            Project data
+        """
+        params = {}
+        if expand:
+            params["expand"] = expand
+        if recent:
+            params["recent"] = str(recent)
+        yield from self.get_paginated("project", params=params)
+    def get_users(
+        self,
+        username: Optional[str] = None,
+        account_id: Optional[str] = None,
+        start_at: int = 0,
+        max_results: int = DEFAULT_PAGE_SIZE,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Get users.
+        Args:
+            username: Username to search for
+            account_id: Account ID to search for
+            start_at: Starting index
+            max_results: Maximum results per page
+        Yields:
+            User data
+        """
+        params = {
+            "startAt": str(start_at),
+            "maxResults": str(min(max_results, MAX_PAGE_SIZE)),
+        }
+        if username:
+            params["username"] = username
+        if account_id:
+            params["accountId"] = account_id
+        yield from self.get_paginated("users/search", params=params)
+    def get_issue_types(self) -> Iterator[Dict[str, Any]]:
+        """Get all issue types."""
+        response = self._make_request("issuetype")
+        if isinstance(response, list):
+            for issue_type in response:
+                yield issue_type
+    def get_statuses(self) -> Iterator[Dict[str, Any]]:
+        """Get all statuses."""
+        response = self._make_request("status")
+        if isinstance(response, list):
+            for status in response:
+                yield status
+    def get_priorities(self) -> Iterator[Dict[str, Any]]:
+        """Get all priorities."""
+        response = self._make_request("priority")
+        if isinstance(response, list):
+            for priority in response:
+                yield priority
+    def get_resolutions(self) -> Iterator[Dict[str, Any]]:
+        """Get all resolutions."""
+        response = self._make_request("resolution")
+        if isinstance(response, list):
+            for resolution in response:
+                yield resolution
+    def get_project_versions(self, project_key: str) -> Iterator[Dict[str, Any]]:
+        """
+        Get versions for a specific project.
+        Args:
+            project_key: Project key
+        Yields:
+            Version data
+        """
+        yield from self.get_paginated(f"project/{project_key}/version")
+    def get_project_components(self, project_key: str) -> Iterator[Dict[str, Any]]:
+        """
+        Get components for a specific project.
+        Args:
+            project_key: Project key
+        Yields:
+            Component data
+        """
+        yield from self.get_paginated(f"project/{project_key}/component")
+def get_client(
+    base_url: str, email: str, api_token: str, timeout: int = REQUEST_TIMEOUT
+) -> JiraClient:
+    """
+    Create and return a Jira API client.
+    Args:
+        base_url: Jira instance URL
+        email: User email for authentication
+        api_token: API token for authentication
+        timeout: Request timeout in seconds
+    Returns:
+        JiraClient instance
+    """
+    return JiraClient(base_url, email, api_token, timeout)

ingestr/src/jira_source/settings.py ADDED Viewed

@@ -0,0 +1,170 @@
+"""Jira source settings and constants"""
+# Default start date for Jira API requests
+DEFAULT_START_DATE = "2010-01-01"
+# Jira API request timeout in seconds
+REQUEST_TIMEOUT = 300
+# Default page size for paginated requests
+DEFAULT_PAGE_SIZE = 100
+# Maximum page size allowed by Jira API
+MAX_PAGE_SIZE = 1000
+# Base API path for Jira Cloud
+API_BASE_PATH = "/rest/api/3"
+# Project fields to retrieve from Jira API
+PROJECT_FIELDS = (
+    "id",
+    "key",
+    "name",
+    "description",
+    "lead",
+    "projectCategory",
+    "projectTypeKey",
+    "simplified",
+    "style",
+    "favourite",
+    "isPrivate",
+    "properties",
+    "entityId",
+    "uuid",
+    "insight",
+)
+# Issue fields to retrieve from Jira API
+ISSUE_FIELDS = (
+    "id",
+    "key",
+    "summary",
+    "description",
+    "issuetype",
+    "status",
+    "priority",
+    "resolution",
+    "assignee",
+    "reporter",
+    "creator",
+    "created",
+    "updated",
+    "resolutiondate",
+    "duedate",
+    "components",
+    "fixVersions",
+    "versions",
+    "labels",
+    "environment",
+    "project",
+    "parent",
+    "subtasks",
+    "issuelinks",
+    "votes",
+    "watches",
+    "worklog",
+    "attachments",
+    "comment",
+    "customfield_*",
+)
+# User fields to retrieve from Jira API
+USER_FIELDS = (
+    "accountId",
+    "accountType",
+    "emailAddress",
+    "displayName",
+    "active",
+    "timeZone",
+    "groups",
+    "applicationRoles",
+    "expand",
+)
+# Board fields to retrieve from Jira API (for Agile/Scrum boards)
+BOARD_FIELDS = (
+    "id",
+    "name",
+    "type",
+    "location",
+    "filter",
+    "subQuery",
+)
+# Sprint fields to retrieve from Jira API
+SPRINT_FIELDS = (
+    "id",
+    "name",
+    "state",
+    "startDate",
+    "endDate",
+    "completeDate",
+    "originBoardId",
+    "goal",
+)
+# Issue type fields to retrieve from Jira API
+ISSUE_TYPE_FIELDS = (
+    "id",
+    "name",
+    "description",
+    "iconUrl",
+    "subtask",
+    "avatarId",
+    "hierarchyLevel",
+)
+# Status fields to retrieve from Jira API
+STATUS_FIELDS = (
+    "id",
+    "name",
+    "description",
+    "iconUrl",
+    "statusCategory",
+)
+# Priority fields to retrieve from Jira API
+PRIORITY_FIELDS = (
+    "id",
+    "name",
+    "description",
+    "iconUrl",
+)
+# Resolution fields to retrieve from Jira API
+RESOLUTION_FIELDS = (
+    "id",
+    "name",
+    "description",
+)
+# Version fields to retrieve from Jira API
+VERSION_FIELDS = (
+    "id",
+    "name",
+    "description",
+    "archived",
+    "released",
+    "startDate",
+    "releaseDate",
+    "overdue",
+    "userStartDate",
+    "userReleaseDate",
+    "project",
+    "projectId",
+)
+# Component fields to retrieve from Jira API
+COMPONENT_FIELDS = (
+    "id",
+    "name",
+    "description",
+    "lead",
+    "assigneeType",
+    "assignee",
+    "realAssigneeType",
+    "realAssignee",
+    "isAssigneeTypeValid",
+    "project",
+    "projectId",
+)

ingestr/src/mongodb/helpers.py CHANGED Viewed

@@ -962,16 +962,46 @@ def process_file_items(file_path: str) -> list[dict]:
     return documents
-def mongodb_insert(uri: str, database: str):
+def mongodb_insert(uri: str):
     """Creates a dlt.destination for inserting data into a MongoDB collection.
     Args:
-        uri (str): MongoDB connection URI.
-        database (str): Name of the MongoDB database.
+        uri (str): MongoDB connection URI including database.
     Returns:
         dlt.destination: A DLT destination object configured for MongoDB.
     """
+    from urllib.parse import urlparse
+    parsed_uri = urlparse(uri)
+    # Handle both mongodb:// and mongodb+srv:// schemes
+    if uri.startswith("mongodb+srv://") or uri.startswith("mongodb://"):
+        # For modern connection strings (MongoDB Atlas), use the URI as-is
+        connection_string = uri
+        # Extract database from path or use default
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+    else:
+        # Legacy handling for backwards compatibility
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 27017
+        username = parsed_uri.username
+        password = parsed_uri.password
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+        # Build connection string
+        if username and password:
+            connection_string = f"mongodb://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"mongodb://{host}:{port}"
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
     state = {"first_batch": True}
@@ -984,9 +1014,7 @@ def mongodb_insert(uri: str, database: str):
         collection_name = table["name"]
         # Connect to MongoDB
-        client: MongoClient
-        with MongoClient(uri) as client:
+        with MongoClient(connection_string) as client:
             db = client[database]
             collection = db[collection_name]

ingestr 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl

Potentially problematic release.

ingestr 0.14.2__py3-none-any.whl → 0.14.4__py3-none-any.whl