ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/docebo/client.py
@@ -0,0 +1,435 @@
"""Docebo API Client for handling authentication and paginated requests."""

from typing import Any, Dict, Iterator, Optional

from ingestr.src.docebo.helpers import normalize_docebo_dates
from ingestr.src.http_client import create_client


class DoceboClient:
    """Client for interacting with Docebo LMS API."""

    def __init__(
        self,
        base_url: str,
        client_id: str,
        client_secret: str,
        username: Optional[str] = None,
        password: Optional[str] = None,
    ):
        """
        Initialize Docebo API client.

        Args:
            base_url: The base URL of your Docebo instance
            client_id: OAuth2 client ID
            client_secret: OAuth2 client secret
            username: Optional username for password grant type
            password: Optional password for password grant type
        """
        self.base_url = base_url.rstrip("/")
        self.client_id = client_id
        self.client_secret = client_secret
        self.username = username
        self.password = password
        self._access_token = None
        # Use shared HTTP client with retry logic
        self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])

    def get_access_token(self) -> str:
        """
        Get or refresh OAuth2 access token.

        Returns:
            Access token string

        Raises:
            Exception: If authentication fails
        """
        if self._access_token:
            return self._access_token

        auth_endpoint = f"{self.base_url}/oauth2/token"

        # Use client_credentials grant type if no username/password provided
        if not self.username or not self.password:
            data = {
                "client_id": self.client_id,
                "client_secret": self.client_secret,
                "grant_type": "client_credentials",
                "scope": "api",
            }
        else:
            data = {
                "client_id": self.client_id,
                "client_secret": self.client_secret,
                "username": self.username,
                "password": self.password,
                "grant_type": "password",
                "scope": "api",
            }

        response = self.client.post(url=auth_endpoint, data=data)
        response.raise_for_status()
        token_data = response.json()
        self._access_token = token_data.get("access_token")
        if not self._access_token:
            raise Exception("Failed to obtain access token from Docebo")

        return self._access_token

    def get_paginated_data(
        self,
        endpoint: str,
        page_size: int = 200,
        params: Optional[Dict[str, Any]] = None,
    ) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch paginated data from a Docebo API endpoint.

        Args:
            endpoint: API endpoint path (e.g., "manage/v1/user")
            page_size: Number of items per page
            params: Additional query parameters

        Yields:
            Batches of items from the API
        """
        url = f"{self.base_url}/{endpoint}"
        headers = {"authorization": f"Bearer {self.get_access_token()}"}

        page = 1
        has_more_data = True

        while has_more_data:
            request_params = {"page": page, "page_size": page_size}
            if params:
                request_params.update(params)

            response = self.client.get(url=url, headers=headers, params=request_params)
            response.raise_for_status()
            data = response.json()

            # Handle paginated response structure
            if "data" in data:
                # Most Docebo endpoints return data in this structure
                if "items" in data["data"]:
                    items = data["data"]["items"]
                    if items:
                        # Normalize dates for each item before yielding
                        normalized_items = [
                            normalize_docebo_dates(item) for item in items
                        ]
                        yield normalized_items

                    # Check for more pages
                    has_more_data = data["data"].get("has_more_data", False)
                    if has_more_data and "total_page_count" in data["data"]:
                        total_pages = data["data"]["total_page_count"]
                        if page >= total_pages:
                            has_more_data = False
                # Some endpoints might return data directly as a list
                elif isinstance(data["data"], list):
                    items = data["data"]
                    if items:
                        # Normalize dates for each item before yielding
                        normalized_items = [
                            normalize_docebo_dates(item) for item in items
                        ]
                        yield normalized_items
                        # For direct list responses, check if we got a full page
                        has_more_data = len(items) == page_size
                    else:
                        has_more_data = False
            # Some endpoints might return items directly
            elif isinstance(data, list):
                if data:
                    # Normalize dates for each item before yielding
                    normalized_items = [normalize_docebo_dates(item) for item in data]
                    yield normalized_items
                    has_more_data = len(data) == page_size
                else:
                    has_more_data = False

            page += 1

    def fetch_users(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all users from Docebo.

        Yields:
            Batches of user data
        """
        yield from self.get_paginated_data("manage/v1/user")

    def fetch_courses(self, page_size: int = 200) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all courses from Docebo.

        Yields:
            Batches of course data
        """
        yield from self.get_paginated_data("learn/v1/courses", page_size=page_size)

    # Phase 1: Core User and Organization Resources
    def fetch_user_fields(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all user fields from Docebo.

        Yields:
            Batches of user field definitions
        """
        yield from self.get_paginated_data("manage/v1/user_fields")

    def fetch_branches(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all branches/organizational units from Docebo.

        Yields:
            Batches of branch/org chart data
        """
        yield from self.get_paginated_data("manage/v1/orgchart")

    # Phase 2: Group Management
    def fetch_groups(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all groups/audiences from Docebo.

        Yields:
            Batches of group data
        """
        yield from self.get_paginated_data("audiences/v1/audience")

    def fetch_all_group_members(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all group members for all groups.

        Yields:
            Batches of group member data with group_id included
        """
        # First fetch all groups
        all_groups: list[Dict[str, Any]] = []
        for group_batch in self.fetch_groups():
            all_groups.extend(group_batch)

        # Then fetch members for each group
        for group in all_groups:
            group_id = (
                group.get("group_id") or group.get("audience_id") or group.get("id")
            )
            if group_id:
                try:
                    for member_batch in self.get_paginated_data(
                        f"manage/v1/group/{group_id}/members"
                    ):
                        # Add group_id to each member record
                        for member in member_batch:
                            member["group_id"] = group_id
                        yield member_batch
                except Exception as e:
                    print(f"Error fetching members for group {group_id}: {e}")
                    continue

    # Phase 3: Advanced Course Resources
    def fetch_course_fields(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all course field definitions from Docebo.

        Yields:
            Batches of course field data
        """
        yield from self.get_paginated_data("learn/v1/courses/field")

    def fetch_all_course_learning_objects(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch learning objects for all courses.

        Yields:
            Batches of learning object data
        """
        # First fetch all courses
        all_courses: list[Dict[str, Any]] = []
        for course_batch in self.fetch_courses():
            all_courses.extend(course_batch)

        # Then fetch learning objects for each course
        for course in all_courses:
            course_id = course.get("id_course") or course.get("course_id")
            if course_id:
                try:
                    endpoint = f"learn/v1/courses/{course_id}/los"
                    for lo_batch in self.get_paginated_data(endpoint):
                        # Add course_id to each learning object
                        for lo in lo_batch:
                            if "course_id" not in lo:
                                lo["course_id"] = course_id
                        yield lo_batch
                except Exception as e:
                    print(
                        f"Error fetching learning objects for course {course_id}: {e}"
                    )
                    continue

    # Phase 4: Learning Plans
    def fetch_learning_plans(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all learning plans from Docebo.

        Yields:
            Batches of learning plan data
        """
        yield from self.get_paginated_data("learningplan/v1/learningplans")

    def fetch_learning_plan_enrollments(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all learning plan enrollments.

        Yields:
            Batches of learning plan enrollment data
        """
        yield from self.get_paginated_data(
            "learningplan/v1/learningplans/enrollments",
            params={"extra_fields[]": "enrollment_status"},
        )

    def fetch_all_learning_plan_course_enrollments(
        self,
    ) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch course enrollments for all learning plans.

        Yields:
            Batches of learning plan course enrollment data
        """
        # First fetch all learning plans
        all_plans: list[Dict[str, Any]] = []
        for plan_batch in self.fetch_learning_plans():
            all_plans.extend(plan_batch)

        # Then fetch course enrollments for each learning plan
        for plan in all_plans:
            plan_id = (
                plan.get("id_path") or plan.get("learning_plan_id") or plan.get("id")
            )
            if plan_id:
                try:
                    endpoint = (
                        f"learningplan/v1/learningplans/{plan_id}/courses/enrollments"
                    )
                    for enrollment_batch in self.get_paginated_data(
                        endpoint, params={"enrollment_level[]": "student"}
                    ):
                        # Add learning_plan_id to each enrollment
                        for enrollment in enrollment_batch:
                            enrollment["learning_plan_id"] = plan_id
                        yield enrollment_batch
                except Exception as e:
                    print(
                        f"Error fetching course enrollments for learning plan {plan_id}: {e}"
                    )
                    continue

    # Phase 5: Enrollments and Surveys
    def fetch_all_course_enrollments(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch enrollments for all courses.

        Yields:
            Batches of course enrollment data
        """
        # First fetch all courses
        all_courses: list[Dict[str, Any]] = []
        for course_batch in self.fetch_courses():
            all_courses.extend(course_batch)

        # Then fetch enrollments for each course
        for course in all_courses:
            course_id = course.get("id_course") or course.get("course_id")
            if course_id:
                try:
                    endpoint = f"course/v1/courses/{course_id}/enrollments"
                    for enrollment_batch in self.get_paginated_data(
                        endpoint, params={"level[]": "3"}
                    ):
                        # Add course_id to each enrollment
                        for enrollment in enrollment_batch:
                            enrollment["course_id"] = course_id
                        yield enrollment_batch
                except Exception as e:
                    print(f"Error fetching enrollments for course {course_id}: {e}")
                    continue

    # Additional Resources
    def fetch_sessions(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all ILT/classroom sessions for all courses.

        Yields:
            Batches of session data
        """
        # First fetch all courses
        all_courses: list[Dict[str, Any]] = []
        for course_batch in self.fetch_courses():
            all_courses.extend(course_batch)

        # Then fetch sessions for each course
        for course in all_courses:
            course_id = course.get("id_course") or course.get("course_id")
            if course_id:
                try:
                    endpoint = f"learn/v1/courses/{course_id}/sessions"
                    for session_batch in self.get_paginated_data(endpoint):
                        # Add course_id to each session
                        for session in session_batch:
                            session["course_id"] = course_id
                        yield session_batch
                except Exception:
                    # Many courses may not have sessions, so just continue
                    continue

    def fetch_categories(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all course categories.

        Yields:
            Batches of category data
        """
        yield from self.get_paginated_data("learn/v1/categories")

    def fetch_certifications(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all certifications.

        Yields:
            Batches of certification data
        """
        yield from self.get_paginated_data("learn/v1/certification")

    def fetch_external_training(self) -> Iterator[list[Dict[str, Any]]]:
        """
        Fetch all external training records.

        Yields:
            Batches of external training data
        """
        yield from self.get_paginated_data("learn/v1/external_training")

    def fetch_survey_answers_for_poll(
        self, poll_id: int, course_id: int
    ) -> Dict[str, Any]:
        """
        Fetch survey answers for a specific poll.

        Args:
            poll_id: The poll/survey ID
            course_id: The course ID containing the poll

        Returns:
            Survey answer data or empty dict if no answers
        """
        url = f"{self.base_url}/learn/v1/survey/{poll_id}/answer"
        headers = {"authorization": f"Bearer {self.get_access_token()}"}
        params = {"id_course": course_id}

        response = self.client.get(url, headers=headers, params=params)
        return normalize_docebo_dates(response.json().get("data", {}))
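For orientation, here is a minimal usage sketch of the client above. It is illustrative only and not part of the released diff; the instance URL and OAuth2 credentials are placeholders, and the printed keys assume a typical Docebo user payload.

    from ingestr.src.docebo.client import DoceboClient

    # Placeholders: substitute a real Docebo instance and credentials.
    client = DoceboClient(
        base_url="https://example.docebosaas.com",
        client_id="my-client-id",
        client_secret="my-client-secret",
    )

    # Each iteration yields one batch (a list) of records whose date
    # fields have already been normalized by normalize_docebo_dates().
    for batch in client.fetch_users():
        for user in batch:
            print(user.get("user_id"), user.get("username"))
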
ingestr/src/docebo/helpers.py
@@ -0,0 +1,97 @@
"""Helper functions for Docebo API data processing."""

from datetime import datetime
from typing import Any, Dict, Union


def normalize_date_field(date_value: Any) -> Union[datetime, str, None]:
    """
    Normalize a single date field that may contain invalid dates.

    Args:
        date_value: The date value to normalize (string, datetime, or None)

    Returns:
        Normalized datetime object or None for invalid/empty dates
    """
    # Unix epoch datetime (1970-01-01 00:00:00 UTC)
    epoch_datetime = datetime(1970, 1, 1)

    # Handle string dates
    if isinstance(date_value, str):
        # Handle '0000-00-00' or '0000-00-00 00:00:00'
        if date_value.startswith("0000-00-00"):
            return epoch_datetime
        # Handle other invalid date formats
        elif date_value in ["", "0", "null", "NULL"]:
            return None
        # Try to parse valid date strings
        else:
            try:
                # Try common date formats
                for fmt in [
                    "%Y-%m-%d %H:%M:%S",
                    "%Y-%m-%d",
                    "%Y/%m/%d %H:%M:%S",
                    "%Y/%m/%d",
                ]:
                    try:
                        return datetime.strptime(date_value, fmt)
                    except ValueError:
                        continue
                # If no format matches, return the original string
                return date_value
            except Exception:
                return date_value
    # Handle datetime objects - pass through
    elif isinstance(date_value, datetime):
        return date_value
    # Handle cases where the field might be None or empty
    elif not date_value:
        return None

    # Return the original value for other types
    return date_value


def normalize_docebo_dates(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize Docebo date fields that contain '0000-00-00' to Unix epoch (1970-01-01).

    Args:
        item: Dictionary containing data from Docebo API

    Returns:
        Dictionary with normalized date fields
    """
    # Date fields that might contain '0000-00-00'
    # Add more fields as needed for different resources
    date_fields = [
        "last_access_date",
        "last_update",
        "creation_date",
        "date_begin",  # Course field
        "date_end",  # Course field
        "date_publish",  # Course field
        "date_unpublish",  # Course field
        "enrollment_date",  # Enrollment field
        "completion_date",  # Enrollment field
        "date_assigned",  # Assignment field
        "date_completed",  # Completion field
        "survey_date",  # Survey field
        "start_date",  # Course/Plan field
        "end_date",  # Course/Plan field
        "date_created",  # Generic creation date
        "created_on",  # Learning plan field
        "updated_on",  # Learning plan field
        "date_modified",  # Generic modification date
        "expire_date",  # Expiration date
        "date_last_updated",  # Update date
        "date",  # Generic date field (used in survey answers)
    ]

    for field in date_fields:
        if field in item:
            item[field] = normalize_date_field(item[field])

    return item
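To make the normalization rules above concrete, a small sketch (not part of the diff; the record is invented, though the field names come from the date_fields list):

    from ingestr.src.docebo.helpers import normalize_docebo_dates

    record = {
        "creation_date": "2024-05-01 10:30:00",
        "last_access_date": "0000-00-00 00:00:00",
        "expire_date": "",
        "username": "jdoe",  # non-date fields pass through untouched
    }
    normalized = normalize_docebo_dates(record)
    # normalized["creation_date"]    -> datetime(2024, 5, 1, 10, 30)
    # normalized["last_access_date"] -> datetime(1970, 1, 1)  (epoch sentinel)
    # normalized["expire_date"]      -> None
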
ingestr/src/elasticsearch/__init__.py
@@ -0,0 +1,80 @@
from datetime import date, datetime
from typing import Any, Optional

import dlt
import pendulum
from dlt.common.time import ensure_pendulum_datetime
from pendulum import parse

from elasticsearch import Elasticsearch


@dlt.source
def elasticsearch_source(
    connection_url: str,
    index: str,
    verify_certs: bool,
    incremental: Optional[dlt.sources.incremental] = None,
):
    client = Elasticsearch(connection_url, verify_certs=verify_certs)

    @dlt.resource(
        name=index, primary_key="id", write_disposition="merge", incremental=incremental
    )
    def get_documents(incremental=incremental):
        body = {"query": {"match_all": {}}}

        if incremental:
            start_value = incremental.last_value
            range_filter = {"gte": start_value}
            if incremental.end_value is not None:
                range_filter["lt"] = incremental.end_value
            body = {"query": {"range": {incremental.cursor_path: range_filter}}}

        page = client.search(index=index, scroll="5m", size=5, body=body)

        sid = page["_scroll_id"]
        hits = page["hits"]["hits"]

        if not hits:
            return

        # fetching first page (via .search)
        for doc in hits:
            doc_data = {"id": doc["_id"], **doc["_source"]}
            if incremental:
                doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
                    doc_data[incremental.cursor_path]
                )
            yield doc_data

        while True:
            # fetching page 2 and other pages (via .scroll)
            page = client.scroll(scroll_id=sid, scroll="5m")
            sid = page["_scroll_id"]
            hits = page["hits"]["hits"]
            if not hits:
                break
            for doc in hits:
                doc_data = {"id": doc["_id"], **doc["_source"]}
                if incremental:
                    doc_data[incremental.cursor_path] = convert_elasticsearch_objs(
                        doc_data[incremental.cursor_path]
                    )
                yield doc_data

        client.clear_scroll(scroll_id=sid)

    return get_documents


def convert_elasticsearch_objs(value: Any) -> Any:
    if isinstance(value, str):
        parsed_date = parse(value, strict=False)
        if parsed_date is not None:
            if isinstance(
                parsed_date,
                (pendulum.DateTime, pendulum.Date, datetime, date, str, float, int),
            ):
                return ensure_pendulum_datetime(parsed_date)
    return value
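A minimal sketch of how this source might be wired into a dlt pipeline (illustrative only, not part of the diff); the cluster URL, index name, cursor field, and DuckDB destination are assumptions.

    import dlt

    from ingestr.src.elasticsearch import elasticsearch_source

    # Assumed: a local cluster, an "events" index, and an "updated_at"
    # timestamp field to use as the incremental cursor.
    source = elasticsearch_source(
        connection_url="http://localhost:9200",
        index="events",
        verify_certs=False,
        incremental=dlt.sources.incremental("updated_at"),
    )

    pipeline = dlt.pipeline(
        pipeline_name="es_to_duckdb",
        destination="duckdb",
        dataset_name="elasticsearch_data",
    )
    pipeline.run(source)

On later runs, the incremental cursor turns the match_all query into a range query over updated_at, so only documents at or after the last seen value are scrolled again.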