ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ingestr has been flagged as potentially problematic.

Files changed (79)
  1. ingestr/main.py +22 -3
  2. ingestr/src/adjust/__init__.py +4 -4
  3. ingestr/src/allium/__init__.py +128 -0
  4. ingestr/src/anthropic/__init__.py +277 -0
  5. ingestr/src/anthropic/helpers.py +525 -0
  6. ingestr/src/appstore/__init__.py +1 -0
  7. ingestr/src/asana_source/__init__.py +1 -1
  8. ingestr/src/buildinfo.py +1 -1
  9. ingestr/src/chess/__init__.py +1 -1
  10. ingestr/src/couchbase_source/__init__.py +118 -0
  11. ingestr/src/couchbase_source/helpers.py +135 -0
  12. ingestr/src/cursor/__init__.py +83 -0
  13. ingestr/src/cursor/helpers.py +188 -0
  14. ingestr/src/destinations.py +169 -1
  15. ingestr/src/docebo/__init__.py +589 -0
  16. ingestr/src/docebo/client.py +435 -0
  17. ingestr/src/docebo/helpers.py +97 -0
  18. ingestr/src/elasticsearch/helpers.py +138 -0
  19. ingestr/src/errors.py +8 -0
  20. ingestr/src/facebook_ads/__init__.py +26 -23
  21. ingestr/src/facebook_ads/helpers.py +47 -1
  22. ingestr/src/factory.py +48 -0
  23. ingestr/src/filesystem/__init__.py +8 -3
  24. ingestr/src/filters.py +9 -0
  25. ingestr/src/fluxx/__init__.py +9906 -0
  26. ingestr/src/fluxx/helpers.py +209 -0
  27. ingestr/src/frankfurter/__init__.py +157 -163
  28. ingestr/src/frankfurter/helpers.py +3 -3
  29. ingestr/src/freshdesk/__init__.py +25 -8
  30. ingestr/src/freshdesk/freshdesk_client.py +40 -5
  31. ingestr/src/fundraiseup/__init__.py +49 -0
  32. ingestr/src/fundraiseup/client.py +81 -0
  33. ingestr/src/github/__init__.py +6 -4
  34. ingestr/src/google_analytics/__init__.py +1 -1
  35. ingestr/src/hostaway/__init__.py +302 -0
  36. ingestr/src/hostaway/client.py +288 -0
  37. ingestr/src/http/__init__.py +35 -0
  38. ingestr/src/http/readers.py +114 -0
  39. ingestr/src/hubspot/__init__.py +6 -12
  40. ingestr/src/influxdb/__init__.py +1 -0
  41. ingestr/src/intercom/__init__.py +142 -0
  42. ingestr/src/intercom/helpers.py +674 -0
  43. ingestr/src/intercom/settings.py +279 -0
  44. ingestr/src/jira_source/__init__.py +340 -0
  45. ingestr/src/jira_source/helpers.py +439 -0
  46. ingestr/src/jira_source/settings.py +170 -0
  47. ingestr/src/klaviyo/__init__.py +5 -5
  48. ingestr/src/linear/__init__.py +553 -116
  49. ingestr/src/linear/helpers.py +77 -38
  50. ingestr/src/mailchimp/__init__.py +126 -0
  51. ingestr/src/mailchimp/helpers.py +226 -0
  52. ingestr/src/mailchimp/settings.py +164 -0
  53. ingestr/src/masking.py +344 -0
  54. ingestr/src/monday/__init__.py +246 -0
  55. ingestr/src/monday/helpers.py +392 -0
  56. ingestr/src/monday/settings.py +328 -0
  57. ingestr/src/mongodb/__init__.py +5 -2
  58. ingestr/src/mongodb/helpers.py +384 -10
  59. ingestr/src/plusvibeai/__init__.py +335 -0
  60. ingestr/src/plusvibeai/helpers.py +544 -0
  61. ingestr/src/plusvibeai/settings.py +252 -0
  62. ingestr/src/revenuecat/__init__.py +83 -0
  63. ingestr/src/revenuecat/helpers.py +237 -0
  64. ingestr/src/salesforce/__init__.py +15 -8
  65. ingestr/src/shopify/__init__.py +1 -1
  66. ingestr/src/smartsheets/__init__.py +33 -5
  67. ingestr/src/socrata_source/__init__.py +83 -0
  68. ingestr/src/socrata_source/helpers.py +85 -0
  69. ingestr/src/socrata_source/settings.py +8 -0
  70. ingestr/src/sources.py +1418 -54
  71. ingestr/src/stripe_analytics/__init__.py +2 -19
  72. ingestr/src/wise/__init__.py +68 -0
  73. ingestr/src/wise/client.py +63 -0
  74. ingestr/tests/unit/test_smartsheets.py +6 -9
  75. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
  76. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
  77. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
  78. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
  79. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/docebo/client.py ADDED
@@ -0,0 +1,435 @@
+ """Docebo API Client for handling authentication and paginated requests."""
+
+ from typing import Any, Dict, Iterator, Optional
+
+ from ingestr.src.docebo.helpers import normalize_docebo_dates
+ from ingestr.src.http_client import create_client
+
+
+ class DoceboClient:
+     """Client for interacting with Docebo LMS API."""
+
+     def __init__(
+         self,
+         base_url: str,
+         client_id: str,
+         client_secret: str,
+         username: Optional[str] = None,
+         password: Optional[str] = None,
+     ):
+         """
+         Initialize Docebo API client.
+
+         Args:
+             base_url: The base URL of your Docebo instance
+             client_id: OAuth2 client ID
+             client_secret: OAuth2 client secret
+             username: Optional username for password grant type
+             password: Optional password for password grant type
+         """
+         self.base_url = base_url.rstrip("/")
+         self.client_id = client_id
+         self.client_secret = client_secret
+         self.username = username
+         self.password = password
+         self._access_token = None
+         # Use shared HTTP client with retry logic
+         self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])
+
+     def get_access_token(self) -> str:
+         """
+         Get or refresh OAuth2 access token.
+
+         Returns:
+             Access token string
+
+         Raises:
+             Exception: If authentication fails
+         """
+         if self._access_token:
+             return self._access_token
+
+         auth_endpoint = f"{self.base_url}/oauth2/token"
+
+         # Use client_credentials grant type if no username/password provided
+         if not self.username or not self.password:
+             data = {
+                 "client_id": self.client_id,
+                 "client_secret": self.client_secret,
+                 "grant_type": "client_credentials",
+                 "scope": "api",
+             }
+         else:
+             data = {
+                 "client_id": self.client_id,
+                 "client_secret": self.client_secret,
+                 "username": self.username,
+                 "password": self.password,
+                 "grant_type": "password",
+                 "scope": "api",
+             }
+
+         response = self.client.post(url=auth_endpoint, data=data)
+         response.raise_for_status()
+         token_data = response.json()
+         self._access_token = token_data.get("access_token")
+         if not self._access_token:
+             raise Exception("Failed to obtain access token from Docebo")
+
+         return self._access_token
+
+     def get_paginated_data(
+         self,
+         endpoint: str,
+         page_size: int = 200,
+         params: Optional[Dict[str, Any]] = None,
+     ) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch paginated data from a Docebo API endpoint.
+
+         Args:
+             endpoint: API endpoint path (e.g., "manage/v1/user")
+             page_size: Number of items per page
+             params: Additional query parameters
+
+         Yields:
+             Batches of items from the API
+         """
+         url = f"{self.base_url}/{endpoint}"
+         headers = {"authorization": f"Bearer {self.get_access_token()}"}
+
+         page = 1
+         has_more_data = True
+
+         while has_more_data:
+             request_params = {"page": page, "page_size": page_size}
+             if params:
+                 request_params.update(params)
+
+             response = self.client.get(url=url, headers=headers, params=request_params)
+             response.raise_for_status()
+             data = response.json()
+
+             # Handle paginated response structure
+             if "data" in data:
+                 # Most Docebo endpoints return data in this structure
+                 if "items" in data["data"]:
+                     items = data["data"]["items"]
+                     if items:
+                         # Normalize dates for each item before yielding
+                         normalized_items = [
+                             normalize_docebo_dates(item) for item in items
+                         ]
+                         yield normalized_items
+
+                     # Check for more pages
+                     has_more_data = data["data"].get("has_more_data", False)
+                     if has_more_data and "total_page_count" in data["data"]:
+                         total_pages = data["data"]["total_page_count"]
+                         if page >= total_pages:
+                             has_more_data = False
+                 # Some endpoints might return data directly as a list
+                 elif isinstance(data["data"], list):
+                     items = data["data"]
+                     if items:
+                         # Normalize dates for each item before yielding
+                         normalized_items = [
+                             normalize_docebo_dates(item) for item in items
+                         ]
+                         yield normalized_items
+                         # For direct list responses, check if we got a full page
+                         has_more_data = len(items) == page_size
+                     else:
+                         has_more_data = False
+             # Some endpoints might return items directly
+             elif isinstance(data, list):
+                 if data:
+                     # Normalize dates for each item before yielding
+                     normalized_items = [normalize_docebo_dates(item) for item in data]
+                     yield normalized_items
+                     has_more_data = len(data) == page_size
+                 else:
+                     has_more_data = False
+
+             page += 1
+
+     def fetch_users(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all users from Docebo.
+
+         Yields:
+             Batches of user data
+         """
+         yield from self.get_paginated_data("manage/v1/user")
+
+     def fetch_courses(self, page_size: int = 200) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all courses from Docebo.
+
+         Yields:
+             Batches of course data
+         """
+         yield from self.get_paginated_data("learn/v1/courses", page_size=page_size)
+
+     # Phase 1: Core User and Organization Resources
+     def fetch_user_fields(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all user fields from Docebo.
+
+         Yields:
+             Batches of user field definitions
+         """
+         yield from self.get_paginated_data("manage/v1/user_fields")
+
+     def fetch_branches(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all branches/organizational units from Docebo.
+
+         Yields:
+             Batches of branch/org chart data
+         """
+         yield from self.get_paginated_data("manage/v1/orgchart")
+
+     # Phase 2: Group Management
+     def fetch_groups(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all groups/audiences from Docebo.
+
+         Yields:
+             Batches of group data
+         """
+         yield from self.get_paginated_data("audiences/v1/audience")
+
+     def fetch_all_group_members(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all group members for all groups.
+
+         Yields:
+             Batches of group member data with group_id included
+         """
+         # First fetch all groups
+         all_groups: list[Dict[str, Any]] = []
+         for group_batch in self.fetch_groups():
+             all_groups.extend(group_batch)
+
+         # Then fetch members for each group
+         for group in all_groups:
+             group_id = (
+                 group.get("group_id") or group.get("audience_id") or group.get("id")
+             )
+             if group_id:
+                 try:
+                     for member_batch in self.get_paginated_data(
+                         f"manage/v1/group/{group_id}/members"
+                     ):
+                         # Add group_id to each member record
+                         for member in member_batch:
+                             member["group_id"] = group_id
+                         yield member_batch
+                 except Exception as e:
+                     print(f"Error fetching members for group {group_id}: {e}")
+                     continue
+
+     # Phase 3: Advanced Course Resources
+     def fetch_course_fields(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all course field definitions from Docebo.
+
+         Yields:
+             Batches of course field data
+         """
+         yield from self.get_paginated_data("learn/v1/courses/field")
+
+     def fetch_all_course_learning_objects(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch learning objects for all courses.
+
+         Yields:
+             Batches of learning object data
+         """
+         # First fetch all courses
+         all_courses: list[Dict[str, Any]] = []
+         for course_batch in self.fetch_courses():
+             all_courses.extend(course_batch)
+
+         # Then fetch learning objects for each course
+         for course in all_courses:
+             course_id = course.get("id_course") or course.get("course_id")
+             if course_id:
+                 try:
+                     endpoint = f"learn/v1/courses/{course_id}/los"
+                     for lo_batch in self.get_paginated_data(endpoint):
+                         # Add course_id to each learning object
+                         for lo in lo_batch:
+                             if "course_id" not in lo:
+                                 lo["course_id"] = course_id
+                         yield lo_batch
+                 except Exception as e:
+                     print(
+                         f"Error fetching learning objects for course {course_id}: {e}"
+                     )
+                     continue
+
+     # Phase 4: Learning Plans
+     def fetch_learning_plans(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all learning plans from Docebo.
+
+         Yields:
+             Batches of learning plan data
+         """
+         yield from self.get_paginated_data("learningplan/v1/learningplans")
+
+     def fetch_learning_plan_enrollments(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all learning plan enrollments.
+
+         Yields:
+             Batches of learning plan enrollment data
+         """
+         yield from self.get_paginated_data(
+             "learningplan/v1/learningplans/enrollments",
+             params={"extra_fields[]": "enrollment_status"},
+         )
+
+     def fetch_all_learning_plan_course_enrollments(
+         self,
+     ) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch course enrollments for all learning plans.
+
+         Yields:
+             Batches of learning plan course enrollment data
+         """
+         # First fetch all learning plans
+         all_plans: list[Dict[str, Any]] = []
+         for plan_batch in self.fetch_learning_plans():
+             all_plans.extend(plan_batch)
+
+         # Then fetch course enrollments for each learning plan
+         for plan in all_plans:
+             plan_id = (
+                 plan.get("id_path") or plan.get("learning_plan_id") or plan.get("id")
+             )
+             if plan_id:
+                 try:
+                     endpoint = (
+                         f"learningplan/v1/learningplans/{plan_id}/courses/enrollments"
+                     )
+                     for enrollment_batch in self.get_paginated_data(
+                         endpoint, params={"enrollment_level[]": "student"}
+                     ):
+                         # Add learning_plan_id to each enrollment
+                         for enrollment in enrollment_batch:
+                             enrollment["learning_plan_id"] = plan_id
+                         yield enrollment_batch
+                 except Exception as e:
+                     print(
+                         f"Error fetching course enrollments for learning plan {plan_id}: {e}"
+                     )
+                     continue
+
+     # Phase 5: Enrollments and Surveys
+     def fetch_all_course_enrollments(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch enrollments for all courses.
+
+         Yields:
+             Batches of course enrollment data
+         """
+         # First fetch all courses
+         all_courses: list[Dict[str, Any]] = []
+         for course_batch in self.fetch_courses():
+             all_courses.extend(course_batch)
+
+         # Then fetch enrollments for each course
+         for course in all_courses:
+             course_id = course.get("id_course") or course.get("course_id")
+             if course_id:
+                 try:
+                     endpoint = f"course/v1/courses/{course_id}/enrollments"
+                     for enrollment_batch in self.get_paginated_data(
+                         endpoint, params={"level[]": "3"}
+                     ):
+                         # Add course_id to each enrollment
+                         for enrollment in enrollment_batch:
+                             enrollment["course_id"] = course_id
+                         yield enrollment_batch
+                 except Exception as e:
+                     print(f"Error fetching enrollments for course {course_id}: {e}")
+                     continue
+
+     # Additional Resources
+     def fetch_sessions(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all ILT/classroom sessions for all courses.
+
+         Yields:
+             Batches of session data
+         """
+         # First fetch all courses
+         all_courses: list[Dict[str, Any]] = []
+         for course_batch in self.fetch_courses():
+             all_courses.extend(course_batch)
+
+         # Then fetch sessions for each course
+         for course in all_courses:
+             course_id = course.get("id_course") or course.get("course_id")
+             if course_id:
+                 try:
+                     endpoint = f"learn/v1/courses/{course_id}/sessions"
+                     for session_batch in self.get_paginated_data(endpoint):
+                         # Add course_id to each session
+                         for session in session_batch:
+                             session["course_id"] = course_id
+                         yield session_batch
+                 except Exception:
+                     # Many courses may not have sessions, so just continue
+                     continue
+
+     def fetch_categories(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all course categories.
+
+         Yields:
+             Batches of category data
+         """
+         yield from self.get_paginated_data("learn/v1/categories")
+
+     def fetch_certifications(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all certifications.
+
+         Yields:
+             Batches of certification data
+         """
+         yield from self.get_paginated_data("learn/v1/certification")
+
+     def fetch_external_training(self) -> Iterator[list[Dict[str, Any]]]:
+         """
+         Fetch all external training records.
+
+         Yields:
+             Batches of external training data
+         """
+         yield from self.get_paginated_data("learn/v1/external_training")
+
+     def fetch_survey_answers_for_poll(
+         self, poll_id: int, course_id: int
+     ) -> Dict[str, Any]:
+         """
+         Fetch survey answers for a specific poll.
+
+         Args:
+             poll_id: The poll/survey ID
+             course_id: The course ID containing the poll
+
+         Returns:
+             Survey answer data or empty dict if no answers
+         """
+         url = f"{self.base_url}/learn/v1/survey/{poll_id}/answer"
+         headers = {"authorization": f"Bearer {self.get_access_token()}"}
+         params = {"id_course": course_id}
+
+         response = self.client.get(url, headers=headers, params=params)
+         return normalize_docebo_dates(response.json().get("data", {}))
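
For orientation, here is a minimal sketch of how the new DoceboClient might be driven on its own; the instance URL and OAuth2 credentials below are placeholders, not values shipped in the release:

    from ingestr.src.docebo.client import DoceboClient

    # Placeholder instance URL and credentials for illustration only.
    client = DoceboClient(
        base_url="https://example.docebosaas.com",
        client_id="my-client-id",
        client_secret="my-client-secret",
    )

    # Each fetch_* method yields lists of dicts whose date fields have
    # already been cleaned by normalize_docebo_dates().
    for batch in client.fetch_users():
        print(f"fetched {len(batch)} users")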
ingestr/src/docebo/helpers.py ADDED
@@ -0,0 +1,97 @@
+ """Helper functions for Docebo API data processing."""
+
+ from datetime import datetime
+ from typing import Any, Dict, Union
+
+
+ def normalize_date_field(date_value: Any) -> Union[datetime, str, None]:
+     """
+     Normalize a single date field that may contain invalid dates.
+
+     Args:
+         date_value: The date value to normalize (string, datetime, or None)
+
+     Returns:
+         Normalized datetime object or None for invalid/empty dates
+     """
+     # Unix epoch datetime (1970-01-01 00:00:00 UTC)
+     epoch_datetime = datetime(1970, 1, 1)
+
+     # Handle string dates
+     if isinstance(date_value, str):
+         # Handle '0000-00-00' or '0000-00-00 00:00:00'
+         if date_value.startswith("0000-00-00"):
+             return epoch_datetime
+         # Handle other invalid date formats
+         elif date_value in ["", "0", "null", "NULL"]:
+             return None
+         # Try to parse valid date strings
+         else:
+             try:
+                 # Try common date formats
+                 for fmt in [
+                     "%Y-%m-%d %H:%M:%S",
+                     "%Y-%m-%d",
+                     "%Y/%m/%d %H:%M:%S",
+                     "%Y/%m/%d",
+                 ]:
+                     try:
+                         return datetime.strptime(date_value, fmt)
+                     except ValueError:
+                         continue
+                 # If no format matches, return the original string
+                 return date_value
+             except Exception:
+                 return date_value
+     # Handle datetime objects - pass through
+     elif isinstance(date_value, datetime):
+         return date_value
+     # Handle cases where the field might be None or empty
+     elif not date_value:
+         return None
+
+     # Return the original value for other types
+     return date_value
+
+
+ def normalize_docebo_dates(item: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Normalize Docebo date fields that contain '0000-00-00' to Unix epoch (1970-01-01).
+
+     Args:
+         item: Dictionary containing data from Docebo API
+
+     Returns:
+         Dictionary with normalized date fields
+     """
+     # Date fields that might contain '0000-00-00'
+     # Add more fields as needed for different resources
+     date_fields = [
+         "last_access_date",
+         "last_update",
+         "creation_date",
+         "date_begin",  # Course field
+         "date_end",  # Course field
+         "date_publish",  # Course field
+         "date_unpublish",  # Course field
+         "enrollment_date",  # Enrollment field
+         "completion_date",  # Enrollment field
+         "date_assigned",  # Assignment field
+         "date_completed",  # Completion field
+         "survey_date",  # Survey field
+         "start_date",  # Course/Plan field
+         "end_date",  # Course/Plan field
+         "date_created",  # Generic creation date
+         "created_on",  # Learning plan field
+         "updated_on",  # Learning plan field
+         "date_modified",  # Generic modification date
+         "expire_date",  # Expiration date
+         "date_last_updated",  # Update date
+         "date",  # Generic date field (used in survey answers)
+     ]
+
+     for field in date_fields:
+         if field in item:
+             item[field] = normalize_date_field(item[field])
+
+     return item
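
The normalization rules are easiest to see on concrete values. A small sketch of the expected behavior, derived directly from the branches of normalize_date_field above:

    from datetime import datetime

    from ingestr.src.docebo.helpers import normalize_date_field

    # Docebo's zero dates collapse to the Unix epoch.
    assert normalize_date_field("0000-00-00 00:00:00") == datetime(1970, 1, 1)
    # Empty/placeholder strings become None.
    assert normalize_date_field("") is None
    # Parseable strings become datetime objects.
    assert normalize_date_field("2024-05-01") == datetime(2024, 5, 1)
    # Anything unparseable passes through unchanged.
    assert normalize_date_field("not a date") == "not a date"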
ingestr/src/elasticsearch/helpers.py ADDED
@@ -0,0 +1,138 @@
+ """Elasticsearch destination helpers"""
+
+ import json
+ import logging
+ from typing import Any, Dict, Iterator
+ from urllib.parse import urlparse
+
+ import dlt
+
+ from elasticsearch import Elasticsearch
+ from elasticsearch.helpers import bulk
+
+ # Suppress Elasticsearch transport logging
+ logging.getLogger("elasticsearch.transport").setLevel(logging.WARNING)
+ logging.getLogger("elastic_transport.transport").setLevel(logging.WARNING)
+
+
+ def process_file_items(file_path: str) -> Iterator[Dict[str, Any]]:
+     """Process items from a file path (JSONL format)."""
+     with open(file_path, "r") as f:
+         for line in f:
+             if line.strip():
+                 doc = json.loads(line.strip())
+                 # Clean DLT metadata
+                 cleaned_doc = {
+                     k: v for k, v in doc.items() if not k.startswith("_dlt_")
+                 }
+                 yield cleaned_doc
+
+
+ def process_iterable_items(items: Any) -> Iterator[Dict[str, Any]]:
+     """Process items from an iterable."""
+     for item in items:
+         if isinstance(item, dict):
+             # Clean DLT metadata
+             cleaned_item = {k: v for k, v in item.items() if not k.startswith("_dlt_")}
+             yield cleaned_item
+
+
+ @dlt.destination(
+     name="elasticsearch",
+     loader_file_format="typed-jsonl",
+     batch_size=1000,
+     naming_convention="snake_case",
+ )
+ def elasticsearch_insert(
+     items, table, connection_string: str = dlt.secrets.value
+ ) -> None:
+     """Insert data into Elasticsearch index.
+
+     Args:
+         items: Data items (file path or iterable)
+         table: Table metadata containing name and schema info
+         connection_string: Elasticsearch connection string
+     """
+     # Parse connection string
+     parsed = urlparse(connection_string)
+
+     # Build Elasticsearch client configuration
+     actual_url = connection_string
+     secure = True  # Default to HTTPS (secure by default)
+
+     if connection_string.startswith("elasticsearch://"):
+         actual_url = connection_string.replace("elasticsearch://", "")
+
+         # Parse to check for query parameters
+         temp_parsed = urlparse("http://" + actual_url)
+         from urllib.parse import parse_qs
+
+         query_params = parse_qs(temp_parsed.query)
+
+         # Check ?secure parameter (defaults to true)
+         if "secure" in query_params:
+             secure = query_params["secure"][0].lower() in ["true", "1", "yes"]
+
+         # Remove query params from URL for ES client
+         actual_url = actual_url.split("?")[0]
+
+         # Add scheme
+         scheme = "https" if secure else "http"
+         actual_url = f"{scheme}://{actual_url}"
+
+         parsed = urlparse(actual_url)
+
+     es_config: Dict[str, Any] = {
+         "hosts": [actual_url],
+         "verify_certs": secure,
+         "ssl_show_warn": False,
+     }
+
+     # Add authentication if present
+     if parsed.username and parsed.password:
+         es_config["http_auth"] = (parsed.username, parsed.password)
+
+     # Get index name from table metadata
+     index_name = table["name"]
+
+     # Connect to Elasticsearch
+     client = Elasticsearch(**es_config)
+
+     # Replace mode: delete existing index if it exists
+     if client.indices.exists(index=index_name):
+         client.indices.delete(index=index_name)
+
+     # Process and insert documents
+     if isinstance(items, str):
+         documents = process_file_items(items)
+     else:
+         documents = process_iterable_items(items)
+
+     # Prepare documents for bulk insert as generator
+     def doc_generator():
+         for doc in documents:
+             es_doc: Dict[str, Any] = {"_index": index_name, "_source": doc.copy()}
+
+             # Use _id if present, otherwise let ES generate one
+             if "_id" in doc:
+                 es_doc["_id"] = str(doc["_id"])
+                 # Remove _id from source since it's metadata
+                 if "_id" in es_doc["_source"]:
+                     del es_doc["_source"]["_id"]
+             elif "id" in doc:
+                 es_doc["_id"] = str(doc["id"])
+
+             yield es_doc
+
+     # Bulk insert
+     try:
+         _, failed_items = bulk(client, doc_generator(), request_timeout=60)
+         if failed_items:
+             failed_count = (
+                 len(failed_items) if isinstance(failed_items, list) else failed_items
+             )
+             raise Exception(
+                 f"Failed to insert {failed_count} documents: {failed_items}"
+             )
+     except Exception as e:
+         raise Exception(f"Elasticsearch bulk insert failed: {str(e)}")
ingestr/src/errors.py CHANGED
@@ -1,3 +1,6 @@
+ import requests
+
+
  class MissingValueError(Exception):
      def __init__(self, value, source):
          super().__init__(f"{value} is required to connect to {source}")
@@ -16,3 +19,8 @@ class InvalidBlobTableError(Exception):
              f"Invalid source table for {source} "
              "Ensure that the table is in the format {bucket-name}/{file glob}"
          )
+
+
+ class HTTPError(Exception):
+     def __init__(self, source: requests.HTTPError):
+         super().__init__(f"HTTP {source.response.status_code}: {source.response.text}")