datasourcelib 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datasourcelib/datasources/azure_devops_source copy.py (added file)
@@ -0,0 +1,126 @@
+ from typing import Any, Dict, List, Optional
+ from datasourcelib.datasources.datasource_base import DataSourceBase
+ from datasourcelib.utils.logger import get_logger
+ from datasourcelib.utils.validators import require_keys
+ import base64
+ import json
+ from bs4 import BeautifulSoup
+
+ logger = get_logger(__name__)
+
+ try:
+     import requests # type: ignore
+ except Exception:
+     requests = None # lazy import handled at runtime
+
+ class AzureDevOpsSource(DataSourceBase):
+
+     def validate_config(self) -> bool:
+         try:
+             require_keys(self.config, ["ado_organization", "ado_personal_access_token","ado_project","ado_query_id"])
+             return True
+         except Exception as ex:
+             logger.error("AzureDevOpsSource.validate_config: %s", ex)
+             return False
+
+     def connect(self) -> bool:
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         # No persistent connection; store auth header
+         pat = self.config.get("ado_personal_access_token")
+         token = pat
+         token_b64 = base64.b64encode(token.encode("utf-8")).decode("utf-8")
+         self._headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+         self._connected = True
+         logger.info("AzureDevOpsSource ready (no persistent connection required)")
+         return True
+
+     def disconnect(self) -> None:
+         self._headers = {}
+         self._connected = False
+         logger.info("AzureDevOpsSource cleared")
+
+     def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         if not getattr(self, "_connected", False):
+             self.connect()
+
+         org = self.config.get("ado_organization")
+         project = self.config.get("ado_project")
+         query_id = self.config.get("ado_query_id")
+         api_version = self.config.get("api_version", "7.1")
+         #path = self.config.get("query_path", query or "")
+         if not query_id:
+             raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")
+
+         base = f"https://dev.azure.com/{org}/"
+         if project:
+             base = f"{base}{project}/"
+         url = f"{base}_apis/wit/wiql/{query_id}"
+         params = {"api-version": api_version}
+         method = self.config.get("method", "GET").upper()
+         query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+         query_response.raise_for_status()
+         #data = resp.json()
+         # Check if the request was successful
+         if query_response.status_code == 200:
+             work_items = query_response.json()['workItems']
+             work_item_details = []
+
+             # Loop through each work item ID to get detailed information
+             for item in work_items:
+                 work_item_id = item['id']
+                 work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
+                 work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
+
+                 if work_item_response.status_code == 200:
+                     logger.info(f"Current Item: {work_item_id}")
+                     text = work_item_response.json()['fields']['System.Description']
+                     c_desc=BeautifulSoup(text, "html.parser").get_text()
+                     c_changedate = work_item_response.json()['fields']['System.ChangedDate']
+                     c_title = work_item_response.json()['fields']['System.Title']
+                     c_status = work_item_response.json()['fields']['System.State']
+                     c_type = work_item_response.json()['fields']['System.WorkItemType']
+                     c_created = work_item_response.json()['fields']['System.CreatedDate']
+
+                     default_value = "-VALUE NOT ASSIGNED-"
+                     c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
+                     logger.info(c_assigned)
+                     c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
+                     c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
+                     c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
+                     c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
+
+                     #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                     fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                     # Ensure work_item_details is a list and append a dict for this work item
+
+                     work_item_details.append({
+                         "id": work_item_id,
+                         "type": c_type,
+                         "title": c_title,
+                         "status": c_status,
+                         "assigned_to": c_assigned,
+                         "created": c_created,
+                         "changed_date": c_changedate,
+                         "tags": c_tags,
+                         "release_type": c_rtype,
+                         "target_date": c_rdate,
+                         "project": c_project,
+                         "description": c_desc,
+                         "full": fullfeature
+                     })
+                 else:
+                     logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
+
+             #work_item_desc = []
+             #for desc in work_item_details:
+             # work_item_desc.append(desc['fields']['System.Description'])
+
+
+             return work_item_details #[{"response": json.dumps(work_item_details)}]
+         else:
+             raise RuntimeError(f"Error: {query_response.status_code}")
+         # Caller decides how to interpret the payload; default: return raw json in a single-item list
+
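
A note on the auth header in connect() above: the code base64-encodes the PAT into token_b64 but never uses it, and instead sends the raw PAT with the Bearer scheme. The scheme Azure DevOps documents for PATs is Basic, with an empty username and ":<pat>" base64-encoded. A minimal sketch of that documented form (the helper name build_pat_headers is ours):

    import base64

    def build_pat_headers(pat: str) -> dict:
        # Basic auth for Azure DevOps PATs: empty username, then ":<pat>" base64-encoded
        token_b64 = base64.b64encode(f":{pat}".encode("utf-8")).decode("utf-8")
        return {"Authorization": f"Basic {token_b64}", "Content-Type": "application/json"}

Bearer is normally reserved for Microsoft Entra OAuth tokens, so if the code above works in a given setup, it is worth confirming which credential is actually being accepted.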

datasourcelib/datasources/azure_devops_source.py
@@ -50,77 +50,135 @@ class AzureDevOpsSource(DataSourceBase):
          project = self.config.get("ado_project")
          query_id = self.config.get("ado_query_id")
          api_version = self.config.get("api_version", "7.1")
-         #path = self.config.get("query_path", query or "")
          if not query_id:
              raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")

          base = f"https://dev.azure.com/{org}/"
          if project:
              base = f"{base}{project}/"
-         url = f"{base}_apis/wit/wiql/{query_id}"
+         # WIQL query by id (returns list of work item refs)
+         wiql_url = f"{base}_apis/wit/wiql/{query_id}"
          params = {"api-version": api_version}
          method = self.config.get("method", "GET").upper()
-         query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+         query_response = requests.request(method, wiql_url, headers=getattr(self, "_headers", {}), params=params)
          query_response.raise_for_status()
-         #data = resp.json()
-         # Check if the request was successful
-         if query_response.status_code == 200:
-             work_items = query_response.json()['workItems']
-             work_item_details = []
-
-             # Loop through each work item ID to get detailed information
-             for item in work_items:
-                 work_item_id = item['id']
-                 work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
-                 work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
-
-                 if work_item_response.status_code == 200:
-                     logger.info(f"Current Item: {work_item_id}")
-                     text = work_item_response.json()['fields']['System.Description']
-                     c_desc=BeautifulSoup(text, "html.parser").get_text()
-                     c_changedate = work_item_response.json()['fields']['System.ChangedDate']
-                     c_title = work_item_response.json()['fields']['System.Title']
-                     c_status = work_item_response.json()['fields']['System.State']
-                     c_type = work_item_response.json()['fields']['System.WorkItemType']
-                     c_created = work_item_response.json()['fields']['System.CreatedDate']
-
-                     default_value = "-VALUE NOT ASSIGNED-"
-                     c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
-                     logger.info(c_assigned)
-                     c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
-                     c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
-                     c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
-                     c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
-
-                     #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                     fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                     # Ensure work_item_details is a list and append a dict for this work item
-
-                     work_item_details.append({
-                         "id": work_item_id,
-                         "type": c_type,
-                         "title": c_title,
-                         "status": c_status,
-                         "assigned_to": c_assigned,
-                         "created": c_created,
-                         "changed_date": c_changedate,
-                         "tags": c_tags,
-                         "release_type": c_rtype,
-                         "target_date": c_rdate,
-                         "project": c_project,
-                         "description": c_desc,
-                         "full": fullfeature
-                     })
-                 else:
-                     logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
-
-             #work_item_desc = []
-             #for desc in work_item_details:
-             # work_item_desc.append(desc['fields']['System.Description'])
-
-
-             return work_item_details #[{"response": json.dumps(work_item_details)}]
-         else:
+
+         if query_response.status_code != 200:
              raise RuntimeError(f"Error: {query_response.status_code}")
-         # Caller decides how to interpret the payload; default: return raw json in a single-item list
-
+
+         work_items_refs = query_response.json().get('workItems', []) or []
+         if not work_items_refs:
+             return []
+
+         # collect ids and fetch details in batch to get all fields for all work item types
+         ids = [str(item.get('id')) for item in work_items_refs if item.get('id')]
+         if not ids:
+             return []
+
+         details_url = f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems"
+         # expand=all to include fields, relations, and attachments
+         params = {
+             "ids": ",".join(ids),
+             "api-version": api_version,
+             "$expand": "all"
+         }
+         details_resp = requests.get(details_url, headers=getattr(self, "_headers", {}), params=params)
+         details_resp.raise_for_status()
+         items = details_resp.json().get("value", [])
+
+         work_item_details: List[Dict[str, Any]] = []
+         for item in items:
+             item_id = item.get("id")
+             fields = item.get("fields", {}) or {}
+
+             # Normalize field keys to safe snake_case-like keys
+             norm_fields: Dict[str, Any] = {}
+             for k, v in fields.items():
+                 nk = k.replace(".", "_")
+                 nk = nk.lower()
+                 norm_fields[nk] = v
+
+             # Helper to safely extract nested displayName for assigned to
+             assigned = norm_fields.get("system_assignedto")
+             if isinstance(assigned, dict):
+                 assigned_to = assigned.get("displayName") or assigned.get("uniqueName") or str(assigned)
+             else:
+                 assigned_to = assigned
+
+             # find a description-like field (some types use different field names)
+             desc = ""
+             for fk in ["system_description", "microsoft_vsts_createdby", "html_description"]:
+                 if fk in norm_fields:
+                     desc = norm_fields.get(fk) or ""
+                     break
+             if not desc:
+                 # fallback: first field key that contains 'description'
+                 for kf, vf in norm_fields.items():
+                     if "description" in kf and vf:
+                         desc = vf
+                         break
+
+             # clean HTML description to text
+             try:
+                 c_desc = BeautifulSoup(desc or "", "html.parser").get_text()
+             except Exception:
+                 c_desc = desc or ""
+
+             # Build common convenience values (use available fields)
+             wi_type = norm_fields.get("system_workitemtype") or norm_fields.get("system_witype") or ""
+             title = norm_fields.get("system_title") or ""
+             status = norm_fields.get("system_state") or ""
+             created = norm_fields.get("system_createddate") or norm_fields.get("system_created") or ""
+             changed = norm_fields.get("system_changeddate") or norm_fields.get("system_changed") or ""
+             tags = norm_fields.get("system_tags", "")
+             project_name = norm_fields.get("custom.projectname") or norm_fields.get("system_teamproject") or ""
+
+             rtype = norm_fields.get("custom.releasetype") or norm_fields.get("custom_releasetype") or ""
+             target_date = norm_fields.get("microsoft_vsts_scheduling_targetdate") or norm_fields.get("microsoft.vsts.scheduling.targetdate") or ""
+
+             # Construct a 'full' description string using available pieces
+             parts = []
+             if wi_type:
+                 parts.append(f"{wi_type} ID {item_id}")
+             else:
+                 parts.append(f"WorkItem {item_id}")
+             if created:
+                 parts.append(f"was created on {created}")
+             if title:
+                 parts.append(f"and has Title '{title}'")
+             if status:
+                 parts.append(f"is currently in {status} state")
+             if assigned_to:
+                 parts.append(f"is assigned to {assigned_to}")
+             if project_name:
+                 parts.append(f"for Project '{project_name}'")
+             if rtype:
+                 parts.append(f"release type '{rtype}'")
+             if target_date:
+                 parts.append(f"with target date '{target_date}'")
+             if tags:
+                 parts.append(f"Tags: {tags}")
+             if c_desc:
+                 parts.append(f"Description: [{c_desc}]")
+             fullfeature = ". ".join(parts)
+
+             # include all normalized fields in the returned object for completeness
+             entry = {
+                 "id": item_id,
+                 "type": wi_type,
+                 "title": title,
+                 "status": status,
+                 "assigned_to": assigned_to,
+                 "created": created,
+                 "changed_date": changed,
+                 "tags": tags,
+                 "project": project_name,
+                 "release_type": rtype,
+                 "target_date": target_date,
+                 "description": c_desc,
+                 "full": fullfeature,
+                 "fields": norm_fields # full field set for this work item
+             }
+             work_item_details.append(entry)
+
+         return work_item_details
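
The rewrite above replaces the per-item GET loop with a single batched call to _apis/wit/workitems?ids=..., which is the main source of the speedup. Two caveats worth flagging: the Work Items list endpoint accepts at most 200 ids per request, so large WIQL results still need chunking; and the lookups norm_fields.get("custom.projectname") and norm_fields.get("microsoft.vsts.scheduling.targetdate") can never match, because the normalization loop has already replaced every dot with an underscore. A chunking sketch under those assumptions (the function name is illustrative, not part of the package):

    from typing import Any, Dict, List
    import requests

    def fetch_work_items_batched(base: str, ids: List[str], headers: Dict[str, str],
                                 api_version: str = "7.1", batch_size: int = 200) -> List[Dict[str, Any]]:
        # The documented cap on ids per call is 200; chunk and concatenate.
        items: List[Dict[str, Any]] = []
        for start in range(0, len(ids), batch_size):
            chunk = ids[start:start + batch_size]
            resp = requests.get(
                f"{base}_apis/wit/workitems",
                headers=headers,
                params={"ids": ",".join(chunk), "api-version": api_version, "$expand": "all"},
            )
            resp.raise_for_status()
            items.extend(resp.json().get("value", []))
        return items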

datasourcelib/datasources/sharepoint_source.py
@@ -7,6 +7,7 @@ import requests
  import pandas as pd
  import os
  from uuid import uuid4
+ from datetime import datetime, timedelta

  logger = get_logger(__name__)
  reader = ByteReader()

@@ -114,50 +115,86 @@ class SharePointSource(DataSourceBase):
          self._drive_id = drives[0].get("id")
          logger.info("Resolved SharePoint drive ID: %s", self._drive_id)

+     def _get_client_credentials(self) -> Tuple[str, str]:
+         """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+         # Fallback to sp_client_config
+         sp_client_config = self.config.get("sp_client_config", {})
+         client_id = sp_client_config.get("sp_client_id")
+         client_secret = sp_client_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             # Fallback to sp_master_config
+             sp_master_config = self.config.get("sp_master_config", {})
+             client_id = client_id or sp_master_config.get("sp_client_id")
+             client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+         return client_id, client_secret
+
+     def _get_download_credentials(self) -> Tuple[str, str]:
+         """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+         # Check sp_download_config first
+         sp_download_config = self.config.get("sp_client_config", {}).get("sp_download_config", {})
+         client_id = sp_download_config.get("sp_client_id")
+         client_secret = sp_download_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             # Fallback to sp_client_config
+             sp_client_config = self.config.get("sp_client_config", {})
+             client_id = client_id or sp_client_config.get("sp_client_id")
+             client_secret = client_secret or sp_client_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             # Fallback to sp_master_config
+             sp_master_config = self.config.get("sp_master_config", {})
+             client_id = client_id or sp_master_config.get("sp_client_id")
+             client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+         return client_id, client_secret
+
+
      def connect(self) -> bool:
          try:
              # basic values
              self._site_url = self.config["sp_site_url"]
-             client_config = self.config["sp_client_config"]
              master_config = self.config["sp_master_config"]

              # get master token (Sites.Read.All)
              try:
-                 self._master_token = self._get_token(
-                     master_config["sp_client_id"], master_config["sp_client_secret"], master_config["sp_tenant_id"]
-                 )
+                 master_client_id = master_config["sp_client_id"]
+                 master_client_secret = master_config["sp_client_secret"]
+                 self._master_token = self._get_token(master_client_id, master_client_secret, master_config["sp_tenant_id"])
                  logger.info("$$$ - Obtained master access token for SharePoint - $$$")
              except Exception as ex:
                  logger.info("$$$ - Failed to obtain master token - $$$")

              # resolve site and drive ids
              try:
-                 self._resolve_site_and_drive(
-                     self.config['sp_site_display_name']
-                 )
+                 self._resolve_site_and_drive(self.config['sp_site_display_name'])
              except Exception:
                  logger.info("$$$ - Failed to resolve site/drive - $$$")
-
+
              # get client token (Site.Selected) for download operations
              try:
-                 # use master tenant id for tenant
-                 self._access_token = self._get_token(
-                     client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"]
-                 )
+                 client_id, client_secret = self._get_client_credentials()
+                 self._access_token = self._get_token(client_id, client_secret, master_config["sp_tenant_id"])
                  logger.info("$$$ - Obtained client access token for SharePoint downloads - $$$")
              except Exception:
                  logger.info("$$$ - Failed to obtain client access token - $$$")
-
+
              # get list client token (Site.Selected) for list operations
              try:
-                 # use master tenant id for tenant
-                 self._list_token = self._get_list_token(
-                     client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"],master_config["sp_domain_name"]
-                 )
+                 client_id, client_secret = self._get_client_credentials()
+                 self._list_token = self._get_list_token(client_id, client_secret, master_config["sp_tenant_id"], master_config["sp_domain_name"])
                  logger.info("$$$ - Obtained client list token for SharePoint list operations - $$$")
              except Exception:
                  logger.info("$$$ - Failed to obtain client list token - $$$")
-
+
              self._connected = True
              logger.info("SharePointSource connected for site: %s", self._site_url)
              return True
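
One inconsistency worth noting: _get_client_credentials carries the same docstring as _get_download_credentials ("sp_download_config, sp_client_config, sp_master_config") but only walks the last two levels. The config shape both helpers assume, as implied by the keys in the code (all values elided):

    sp_config = {
        "sp_site_url": "...",
        "sp_site_display_name": "...",
        "sp_master_config": {
            "sp_client_id": "...", "sp_client_secret": "...",
            "sp_tenant_id": "...", "sp_domain_name": "...",
        },
        "sp_client_config": {
            "sp_client_id": "...", "sp_client_secret": "...",
            # note: the download credentials are nested inside sp_client_config
            "sp_download_config": {"sp_client_id": "...", "sp_client_secret": "..."},
        },
    }

Also note that connect() swallows every token failure with a logger.info call and still sets self._connected = True, so a misconfigured source only surfaces later, when a download or list operation fails.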

@@ -324,10 +361,9 @@ class SharePointSource(DataSourceBase):
          results = []
          items = self._fetch_list_items_via_rest(relative_path)

-         if str(self.config.get("sp_client_config",{}).get("sp_download_config",{})["sp_client_id"]):
-             self._access_token = self._get_token(
-                 self.config.get("sp_client_config",{}).get("sp_download_config",{})["sp_client_id"], self.config.get("sp_client_config",{}).get("sp_download_config",{})["sp_client_secret"], self.config.get("sp_master_config",{})["sp_tenant_id"]
-             )
+         client_id, client_secret = self._get_download_credentials()
+
+         self._access_token = self._get_token(client_id, client_secret, self.config.get("sp_master_config",{})["sp_tenant_id"])
          #test running with hardcoded items
          if False:
              items = []
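
The removed guard was doubly broken: str(...) of a present value is always truthy, so the condition never gated anything, and the ["sp_client_id"] subscript raised KeyError whenever sp_download_config was absent. For comparison, a safe form of the intended check (a sketch; the function name is ours and 'config' stands for the source's config dict):

    def resolve_download_credentials(config: dict):
        # present-and-nonempty check, no KeyError when the section is missing
        download_cfg = config.get("sp_client_config", {}).get("sp_download_config", {})
        if download_cfg.get("sp_client_id") and download_cfg.get("sp_client_secret"):
            return download_cfg["sp_client_id"], download_cfg["sp_client_secret"]
        return None

The new _get_download_credentials path achieves the same effect with explicit fallbacks.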

@@ -350,12 +386,29 @@ class SharePointSource(DataSourceBase):
              })

          for item in items:
-             item_name = item.get("Title")
-             item_display_name = item.get("SiteDisplayName")
+             #the path after [Shared Documents/] in relative path
+             item_relative_path = item.get("RelativePath") or item.get("relativepath") or item.get("relativePath")
+             item_name = item.get("Title") or item.get("title")
+             item_display_name = item.get("SiteDisplayName") or item.get("sitedisplayname") or item.get("siteDisplayName")
+
+             # Check ModifiedDate filter
+             # "2024-01-15" → 10 chars || "20240115" → 8 chars
+             modified_date_str = item.get("ModifiedDate") or item.get("modifieddate") or item.get("modifiedDate")
+             if modified_date_str:
+                 try:
+                     modified_date = datetime.fromisoformat(modified_date_str.replace('Z', '+00:00'))
+                     if datetime.now(modified_date.tzinfo) - modified_date < timedelta(days=1):
+                         continue
+                 except Exception:
+                     pass
+
+             if not item_relative_path:
+                 logger.warning("Item missing RelativePath: %s", item)
+                 continue
+
              #get site id and drive id for this item
              self._resolve_site_and_drive(item_display_name)
-             #the path after [Shared Documents/] in relative path
-             item_relative_path = item.get("RelativePath")
+
              try:
                  content, filename = self._download_file_bytes(item_relative_path)
                  saved = self._save_file_if_requested(content, filename, save_path)
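
The ModifiedDate filter above is easy to misread: items modified less than a day ago are skipped (continue), not kept, and a parse failure silently keeps the item. A compact restatement of the predicate, assuming ISO-8601 timestamps with a trailing Z as in the code:

    from datetime import datetime, timedelta

    def modified_within(modified_date_str: str, days: int = 1) -> bool:
        # mirrors the parsing above: "Z" is rewritten to "+00:00" for fromisoformat()
        modified = datetime.fromisoformat(modified_date_str.replace("Z", "+00:00"))
        return datetime.now(modified.tzinfo) - modified < timedelta(days=days)

    # the loop skips an item when this returns True
    print(modified_within("2024-01-15T08:30:00Z"))  # False once the date is over a day old

If the intent was "only sync files changed in the last day", the inequality is inverted. The dangling comment about 10-character and 8-character date formats does not correspond to any code path.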

datasourcelib/datasources/sql_source.py
@@ -16,8 +16,16 @@ class SQLDataSource(DataSourceBase):
          self._is_sqlite = False

      def validate_config(self) -> bool:
+         """
+         Validate config. If sql_windows_auth is True then sql_username/sql_password are optional.
+         Otherwise require sql_username and sql_password.
+         """
          try:
-             require_keys(self.config, ["sql_server","sql_database","sql_username","sql_password","sql_is_onprem"])
+             # Always require server/database at minimum
+             require_keys(self.config, ["sql_server", "sql_database"])
+             # If not using Windows authentication, require credentials
+             if not bool(self.config.get("sql_windows_auth", False)):
+                 require_keys(self.config, ["sql_username", "sql_password"])
              return True
          except Exception as ex:
              logger.error("SQLDataSource.validate_config: %s", ex)
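
Example configs accepted by the new validate_config, using only keys that appear in the code (values illustrative):

    cfg_windows_auth = {
        "sql_server": "myserver.example.com",
        "sql_database": "mydb",
        "sql_windows_auth": True,   # username/password now optional
    }
    cfg_sql_auth = {
        "sql_server": "myserver.example.com",
        "sql_database": "mydb",
        "sql_username": "myuser",
        "sql_password": "...",      # required when sql_windows_auth is falsy
    }

Note that sql_is_onprem is no longer required even though connect() still reads it; it defaults to False, i.e. cloud-style encryption settings.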

@@ -27,22 +35,31 @@ class SQLDataSource(DataSourceBase):
          try:
              sql_server = self.config.get("sql_server", "")
              sql_database = self.config.get("sql_database", "")
-             sql_username = self.config.get("sql_username", "")
-             sql_password = self.config.get("sql_password", "")
              sql_is_onprem = self.config.get("sql_is_onprem", False)
-
+
+             # Determine auth mode: sql_windows_auth (Trusted Connection) overrides username/password
+             sql_windows_auth = bool(self.config.get("sql_windows_auth", False))
+
              # Get available driver
              sql_driver = self._get_available_driver()
-
-             # Build connection string with appropriate encryption settings
+
+             # Build connection string
              conn_params = [
                  f'DRIVER={sql_driver}',
                  f'SERVER={sql_server}',
                  f'DATABASE={sql_database}',
-                 f'UID={sql_username}',
-                 f'PWD={sql_password}'
              ]
-
+
+             if sql_windows_auth:
+                 # Use integrated Windows authentication (Trusted Connection)
+                 # This will use the current process credentials / kerberos ticket.
+                 conn_params.append('Trusted_Connection=yes')
+                 logger.info("SQLDataSource using Windows (integrated) authentication")
+             else:
+                 sql_username = self.config.get("sql_username", "")
+                 sql_password = self.config.get("sql_password", "")
+                 conn_params.extend([f'UID={sql_username}', f'PWD={sql_password}'])
+
              # Add encryption settings based on environment
              if not sql_is_onprem:
                  conn_params.extend([

@@ -56,13 +73,13 @@ class SQLDataSource(DataSourceBase):
              ])

              conn_str = ';'.join(conn_params)
-
+
              # Attempt connection with timeout
              self._conn = pyodbc.connect(conn_str, timeout=30)
              self._connected = True
-             logger.info("SQLDataSource connected to %s using driver %s", sql_server, sql_driver)
+             logger.info("SQLDataSource connected to %s using driver %s (sql_windows_auth=%s)", sql_server, sql_driver, sql_windows_auth)
              return True
-
+
          except pyodbc.Error as ex:
              logger.error("SQLDataSource.connect failed - ODBC Error: %s", ex)
              self._connected = False
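
For reference, the two branches assemble pyodbc connection strings along these lines (the driver name is illustrative; _get_available_driver picks whichever SQL Server ODBC driver is installed):

    windows_auth = ("DRIVER={ODBC Driver 18 for SQL Server};"
                    "SERVER=myserver.example.com;DATABASE=mydb;"
                    "Trusted_Connection=yes")

    sql_auth = ("DRIVER={ODBC Driver 18 for SQL Server};"
                "SERVER=myserver.example.com;DATABASE=mydb;"
                "UID=myuser;PWD=mypassword")

With Trusted_Connection=yes the driver authenticates as the account running the process (or its Kerberos ticket on Linux), which is why no UID/PWD pair is appended in that branch.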

datasourcelib/indexes/azure_search_index.py
@@ -110,7 +110,7 @@ class AzureSearchIndexer:
              logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
              raise

-     def _build_vector_search_config(self):
+     def _build_vector_search_config_old(self):
          AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
          vector_config = self.config.get("vector_config", {})
          dimensions = vector_config.get("dimensions", 1536)
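
The hunk below adds a rewritten _build_vector_search_config that attaches an Azure OpenAI vectorizer. A few observations: the algorithm value read from vector_config is never used (HNSW is hard-coded); passing a raw dict in VectorSearch(vectorizers=[...]) will typically fail the SDK's model serialization, which is what the nested try/except fallbacks paper over; and the "options"/"fieldMapping" block is not part of the documented vectorizer schema (content-to-vector mapping belongs to skillsets and index projections, not vectorizers). For comparison, a sketch using the typed models in azure-search-documents 11.6.x; constructor and parameter names have shifted across SDK versions, so treat this as one known-good shape rather than the only one:

    from azure.search.documents.indexes.models import (
        AzureOpenAIVectorizer,
        AzureOpenAIVectorizerParameters,
        HnswAlgorithmConfiguration,
        VectorSearch,
        VectorSearchProfile,
    )

    vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")],
        profiles=[VectorSearchProfile(
            name="vector-profile-1",
            algorithm_configuration_name="algorithms-config-1",
            vectorizer_name="azure-openai-vectorizer",
        )],
        vectorizers=[AzureOpenAIVectorizer(
            vectorizer_name="azure-openai-vectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url="https://<resource>.openai.azure.com",  # placeholder
                deployment_name="text-embedding-3-small",            # assumed deployment
                model_name="text-embedding-3-small",
                api_key="<api-key>",                                 # placeholder
            ),
        )],
    )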

@@ -121,6 +121,107 @@ class AzureSearchIndexer:
          )

          return vector_search, dimensions
+
+     def _build_vector_search_config(self):
+         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
+
+         vector_config = self.config.get("vector_config", {})
+         dimensions = vector_config.get("dimensions", 1536)
+         algorithm = vector_config.get("algorithm", "hnsw").lower()
+
+         # Build algorithm configuration (SDK model if available)
+         alg_cfg = HnswAlgorithmConfiguration(name="algorithms-config-1")
+
+         # Build vectorizer settings using Azure OpenAI config from vector_db_config
+         deployment = self.config.get("embedding_deployment")
+         endpoint = self.config.get("embedding_endpoint")
+         api_key = self.config.get("embedding_key")
+         # modelName required for API version 2025-09-01 — prefer explicit embedding_model, fall back to deployment
+         model_name = self.config.get("embedding_model") or deployment
+         content_field = self.config.get("content_field", "content")
+         vector_field = self.config.get("vector_field", "contentVector")
+
+         if not model_name:
+             raise RuntimeError("Vectorizer configuration requires 'embedding_model' or 'embedding_deployment' in vector_db_config")
+
+         # Define vectorizer with explicit name and required azureOpenAIParameters including modelName
+         vectorizer_name = "azure-openai-vectorizer"
+         vectorizer = {
+             "name": vectorizer_name,
+             "kind": "azureOpenAI",
+             "azureOpenAIParameters": {
+                 "resourceUri": endpoint.rstrip('/') if endpoint else None,
+                 # include both modelName (required) and deploymentId (if provided)
+                 "modelName": model_name,
+                 **({"deploymentId": deployment} if deployment else {}),
+                 "apiKey": api_key
+             },
+             "options": {
+                 "fieldMapping": [
+                     {
+                         "sourceContext": f"/document/{content_field}",
+                         "outputs": [
+                             {
+                                 "targetContext": f"/document/{vector_field}",
+                                 "targetDimensions": dimensions
+                             }
+                         ]
+                     }
+                 ]
+             }
+         }
+
+         profile_name = "vector-profile-1"
+         try:
+             # Create profile with vectorizer reference (SDK may expect vectorizer_name or vectorizer depending on version)
+             try:
+                 profile = VectorSearchProfile(
+                     name=profile_name,
+                     algorithm_configuration_name="algorithms-config-1",
+                     vectorizer_name=vectorizer_name
+                 )
+             except TypeError:
+                 # fallback if SDK constructor uses different parameter names
+                 profile = VectorSearchProfile(name=profile_name, algorithm_configuration_name="algorithms-config-1")
+                 try:
+                     setattr(profile, "vectorizer_name", vectorizer_name)
+                 except Exception:
+                     pass
+
+             try:
+                 # Construct full vector search config with both profile and vectorizer
+                 vector_search = VectorSearch(
+                     profiles=[profile],
+                     algorithms=[alg_cfg],
+                     vectorizers=[vectorizer]
+                 )
+             except Exception:
+                 # Fallback to dict if SDK constructor differs
+                 vector_search = {
+                     "profiles": [{
+                         "name": profile_name,
+                         "algorithmConfigurationName": "algorithms-config-1",
+                         "vectorizerName": vectorizer_name
+                     }],
+                     "algorithms": [{"name": "algorithms-config-1"}],
+                     "vectorizers": [vectorizer]
+                 }
+         except Exception:
+             # Full dict fallback
+             vector_search = {
+                 "profiles": [{
+                     "name": profile_name,
+                     "algorithmConfigurationName": "algorithms-config-1",
+                     "vectorizerName": vectorizer_name
+                 }],
+                 "algorithms": [{"name": "algorithms-config-1"}],
+                 "vectorizers": [vectorizer]
+             }
+
+         logger.info("Built vector_search config (dimensions=%s, model=%s, vectorizer=%s)",
+                     dimensions, model_name, vectorizer_name)
+         return vector_search, dimensions
+

      def _build_semantic_settings(self):
          """

datasourcelib/strategies/full_load.py
@@ -1,6 +1,6 @@
  from datasourcelib.core.sync_base import SyncBase
  from datasourcelib.utils.logger import get_logger
- from datasourcelib.indexes.azure_search_index_vector import AzureSearchIndexer
+ from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
  logger = get_logger(__name__)

  class FullLoadStrategy(SyncBase):

datasourcelib-0.1.5.dist-info/METADATA
@@ -1,10 +1,10 @@
  Metadata-Version: 2.4
  Name: datasourcelib
- Version: 0.1.3
+ Version: 0.1.5
  Summary: Data source sync strategies for vector DBs
- Home-page: https://github.com/jaiprakash0217/datasourcelib
- Author: Jai Prakash
- Author-email: jai.prakash@jai12ka4.com
+ Home-page: https://github.com/akashmaurya0217/datasourcelib
+ Author: Akash Kumar Maurya
+ Author-email: mrelectronicsarduino@gmail.com
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License

datasourcelib-0.1.5.dist-info/RECORD
@@ -4,20 +4,19 @@ datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3N
  datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
  datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
  datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
- datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+ datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+ datasourcelib/datasources/azure_devops_source.py,sha256=3hyZIrUdgwZEQNjb2iZGDMJcAw3Z6r7oV0hWAq_zMsg,8005
  datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
  datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
  datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
  datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
- datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
- datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
+ datasourcelib/datasources/sharepoint_source.py,sha256=t3rly2mVEI2qEDuUVqstck5ktkZW0BnF16Bke_NjPLI,23126
+ datasourcelib/datasources/sql_source.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
  datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
- datasourcelib/indexes/azure_search_index.py,sha256=o3BoSxURBk5jCC3AlNz-v9_igg-dXYS4yUxXZwSfqFg,17265
- datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
- datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
+ datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
  datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
  datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
- datasourcelib/strategies/full_load.py,sha256=zqDZZcmyJKXQ4v3coq5njjadlBNI9V8f_lfXVZCoLbQ,1698
+ datasourcelib/strategies/full_load.py,sha256=U1a9wO_ZLRnMInvU0IRW-ZKnhu0Cv437VcNMKIYuzMA,1691
  datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
  datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
  datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900

@@ -27,8 +26,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
  datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
  datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
  datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
- datasourcelib-0.1.3.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
- datasourcelib-0.1.3.dist-info/METADATA,sha256=cPVrPEkPN22sTYOoO20byXcpu5hvKVQIPu3elgyyEko,1185
- datasourcelib-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- datasourcelib-0.1.3.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
- datasourcelib-0.1.3.dist-info/RECORD,,
+ datasourcelib-0.1.5.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+ datasourcelib-0.1.5.dist-info/METADATA,sha256=jDGgTdya-zt_go_TpEOJNfTQUI7CsbjM4m-Fg51XdqU,1199
+ datasourcelib-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datasourcelib-0.1.5.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+ datasourcelib-0.1.5.dist-info/RECORD,,

datasourcelib/indexes/azure_search_index_only.py (removed file)
@@ -1,162 +0,0 @@
- from typing import List, Dict, Any, Optional
- from datasourcelib.utils.logger import get_logger
-
- logger = get_logger(__name__)
-
- class AzureSearchIndexer:
-     """
-     Minimal Azure Cognitive Search indexer wrapper.
-     Expects vector_db_config with:
-       - service_endpoint: str
-       - index_name: str
-       - api_key: str
-     Optional:
-       - key_field: name of unique key in documents (default 'id')
-     """
-
-     def __init__(self, vector_db_config: Dict[str, Any]):
-         self.config = vector_db_config or {}
-         self._client = None
-         self._index_client = None
-
-     def validate_config(self) -> bool:
-         required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-         missing = [k for k in required if k not in self.config]
-         if missing:
-             logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-             return False
-         return True
-
-     def _ensure_sdk(self):
-         try:
-             from azure.core.credentials import AzureKeyCredential # type: ignore
-             from azure.search.documents import SearchClient # type: ignore
-             from azure.search.documents.indexes import SearchIndexClient # type: ignore
-             from azure.search.documents.indexes.models import (
-                 SearchIndex,
-                 SimpleField,
-                 SearchableField,
-                 SearchFieldDataType,
-             ) # type: ignore
-         except Exception as e:
-             raise RuntimeError("azure-search-documents package is required: install azure-search-documents") from e
-
-         return AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType
-
-     def _infer_field_type(self, value) -> Any:
-         """
-         Map Python types to SearchFieldDataType
-         """
-         *_, SearchFieldDataType = self._ensure_sdk()
-         if value is None:
-             return SearchFieldDataType.String
-         t = type(value)
-         if t is str:
-             return SearchFieldDataType.String
-         if t is bool:
-             return SearchFieldDataType.Boolean
-         if t is int:
-             return SearchFieldDataType.Int32
-         if t is float:
-             return SearchFieldDataType.Double
-         # fallback to string
-         return SearchFieldDataType.String
-
-     def _build_fields(self, sample: Dict[str, Any], key_field: str):
-         AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-
-         fields = []
-         # ensure key field present
-         if key_field not in sample:
-             # we'll create a string key, uploader will populate unique ids
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-         else:
-             typ = self._infer_field_type(sample[key_field])
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-         for k, v in sample.items():
-             logger.info(f"================={k}============")
-             if k == key_field:
-                 continue
-             typ = self._infer_field_type(v)
-             # for strings use SearchableField so full text queries work
-             if typ == SearchFieldDataType.String:
-                 fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-             else:
-                 fields.append(SimpleField(name=k, type=typ))
-         return fields
-
-     def create_index(self, sample: Dict[str, Any]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-             fields = self._build_fields(sample, key_field)
-             logger.info("=================Creating Index============")
-             index = SearchIndex(name=index_name, fields=fields)
-             # create or update index
-             index_client.create_or_update_index(index)
-             logger.info("Azure Search index '%s' created/updated", index_name)
-             return True
-         except Exception as ex:
-             logger.exception("AzureSearchIndexer.create_index failed")
-             return False
-
-     def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             # ensure each doc has key_field
-             from uuid import uuid4
-             for d in docs:
-                 if key_field not in d:
-                     d[key_field] = str(uuid4())
-             # ensure each doc has key_field is of string type
-             for d in docs:
-                 if key_field in d:
-                     typ = self._infer_field_type(d[key_field])
-                     if typ != SearchFieldDataType.String:
-                         d[key_field] = str(d[key_field])
-
-             client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
-             logger.info("Uploading %d documents to index %s", len(docs), index_name)
-             result = client.upload_documents(documents=docs)
-             # Check results for failures
-             failed = [r for r in result if not r.succeeded]
-             if failed:
-                 logger.error("Some documents failed to upload: %s", failed)
-                 return False
-             logger.info("Uploaded documents successfully")
-             return True
-         except Exception:
-             logger.exception("AzureSearchIndexer.upload_documents failed")
-             return False
-
-     def index(self, rows: List[Dict[str, Any]]) -> bool:
-         """
-         High level: create index (based on first row) and upload all rows.
-         """
-         if not rows:
-             logger.error("AzureSearchIndexer.index called with empty rows")
-             return False
-         try:
-             if not self.validate_config():
-                 return False
-             sample = rows[0]
-             logger.info(f"================={sample}============")
-             ok = self.create_index(sample)
-             if not ok:
-                 return False
-             ok2 = self.upload_documents(rows)
-             return ok2
-         except Exception:
-             logger.exception("AzureSearchIndexer.index failed")
-             return False

datasourcelib/indexes/azure_search_index_vector.py (removed file)
@@ -1,286 +0,0 @@
- from typing import List, Dict, Any, Optional
- from datasourcelib.utils.logger import get_logger
-
- logger = get_logger(__name__)
-
- class AzureSearchIndexer:
-     """
-     Azure Cognitive Search indexer with vector search support.
-     Required vector_db_config:
-       - aisearch_endpoint: str
-       - aisearch_index_name: str
-       - aisearch_api_key
-
-     Optional vector search config:
-       - vectorization: bool (enable vector search)
-       - vector_config: dict
-           - dimensions: int (default 1024)
-           - algorithm: str ('hnsw' or 'flat', default 'hnsw')
-           - metric: str ('cosine', 'euclidean', 'dotProduct', default 'cosine')
-       - key_field: str (default 'id')
-       - vector_field: str (default 'contentVector')
-       - embedding_endpoint: str (Azure OpenAI endpoint for embeddings)
-       - embedding_key: str (Azure OpenAI API key)
-       - embedding_deployment: str (Azure OpenAI model deployment name)
-     """
-
-     def __init__(self, vector_db_config: Dict[str, Any]):
-         self.config = vector_db_config or {}
-         self._client = None
-         self._index_client = None
-         self._embedding_client = None
-
-     def validate_config(self) -> bool:
-         required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-         missing = [k for k in required if k not in self.config]
-
-         # Check vector search requirements if enabled
-         if self.config.get("vectorization", False):
-             vector_required = ("embedding_endpoint", "embedding_key", "embedding_deployment")
-             missing.extend([k for k in vector_required if k not in self.config])
-
-         if missing:
-             logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-             return False
-         return True
-
-     def _ensure_sdk(self):
-         try:
-             from azure.core.credentials import AzureKeyCredential # type: ignore
-             from azure.search.documents import SearchClient # type: ignore
-             from azure.search.documents.indexes import SearchIndexClient # type: ignore
-             from openai import AzureOpenAI # type: ignore
-             from azure.search.documents.indexes.models import (
-                 SearchIndex,
-                 SearchField,
-                 SearchFieldDataType,
-                 SimpleField,
-                 SearchableField,
-                 VectorSearch,
-                 VectorSearchProfile,
-                 HnswAlgorithmConfiguration
-             ) # type: ignore
-
-         except Exception as e:
-             raise RuntimeError("Required packages missing. Install: azure-search-documents openai") from e
-
-         return (
-             AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration
-         )
-
-     def _setup_embedding_client(self):
-         if not self._embedding_client and self.config.get("vectorization"):
-             try:
-                 AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-                 self._embedding_client = AzureOpenAI(
-                     api_version=self.config["embedding_api_version"],
-                     azure_endpoint=self.config["embedding_endpoint"],
-                     api_key=self.config["embedding_key"],
-                 )
-                 logger.info("Azure OpenAI embedding client initialized")
-             except Exception as ex:
-                 logger.exception("Failed to initialize embedding client")
-                 raise
-
-     def _get_embeddings(self, text: str) -> List[float]:
-         try:
-             self._setup_embedding_client()
-             response = self._embedding_client.embeddings.create(
-                 model=self.config["embedding_deployment"],
-                 input=text
-             )
-             return response.data[0].embedding
-         except Exception as ex:
-             logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
-             raise
-
-     def _build_vector_search_config(self):
-         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-         vector_config = self.config.get("vector_config", {})
-         dimensions = vector_config.get("dimensions", 1536)
-
-         vector_search = VectorSearch(
-             profiles=[VectorSearchProfile(name="vector-profile-1", algorithm_configuration_name="algorithms-config-1")],
-             algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")]
-         )
-
-         return vector_search, dimensions
-
-     def _infer_field_type(self, value) -> Any:
-         #Map Python types to SearchFieldDataType, including collections
-
-         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-         if value is None:
-             return SearchFieldDataType.String
-
-         t = type(value)
-
-         # Handle list/array types as Collections
-         if t in (list, tuple):
-             # If empty list, default to Collection of Double
-             if not value:
-                 return SearchFieldDataType.Collection(SearchFieldDataType.Double)
-             # Get type of first element for non-empty lists
-             element_type = self._infer_field_type(value[0])
-             return SearchFieldDataType.Collection(element_type)
-         # Handle vector embeddings (list or tuple of floats)
-         if type(value) in (list, tuple) and all(isinstance(x, (int, float)) for x in value):
-             return SearchFieldDataType.Collection(SearchFieldDataType.Single)
-
-         # Handle basic types
-         logger.info(f"######## Infer field type for value:[ {value} ] of type [ {t} ]")
-         if t is bool:
-             return SearchFieldDataType.Boolean
-         if t is int:
-             return SearchFieldDataType.Int32
-         if t is float:
-             return SearchFieldDataType.Double
-         print(f"############## Infer field type for value: {value} of type {t}")
-         print(t is str)
-         if t is str:
-             return SearchFieldDataType.String
-         # fallback to string
-         logger.warning(f"Falling back to string type for value: {value} of type {t}")
-         return SearchFieldDataType.String
-
-     def _build_fields(self, sample: Dict[str, Any], key_field: str):
-         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-         fields = []
-         # Add key field
-         if key_field not in sample:
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-         else:
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-         # Add regular fields
-         for k, v in sample.items():
-             logger.info(f"================={k}============")
-             if k == key_field:
-                 continue
-             logger.info(f"#### Infer field type for field: {k}")
-             typ = self._infer_field_type(v)
-             logger.info(f"#### Inferred type for field {k}: {typ}")
-             if typ == SearchFieldDataType.String:
-                 fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-             else:
-                 fields.append(SimpleField(name=k, type=typ))
-
-         # Add vector field if vectorization is enabled
-         if self.config.get("vectorization"):
-             vector_field = self.config.get("vector_field", "contentVector")
-             _, dimensions = self._build_vector_search_config()
-             fields.append(
-                 SearchField(
-                     name=vector_field,
-                     type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
-                     searchable=True,
-                     vector_search_dimensions=dimensions,
-                     vector_search_profile_name="vector-profile-1"
-                 )
-             )
-
-         return fields
-
-     def create_index(self, sample: Dict[str, Any]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-             fields = self._build_fields(sample, key_field)
-
-             # Create index with vector search if enabled
-             if self.config.get("vectorization"):
-                 vector_search, _ = self._build_vector_search_config()
-                 index = SearchIndex(
-                     name=index_name,
-                     fields=fields,
-                     vector_search=vector_search
-                 )
-             else:
-                 index = SearchIndex(name=index_name, fields=fields)
-
-             index_client.create_or_update_index(index)
-             logger.info(f"Azure Search index '{index_name}' created/updated with vectorization={self.config.get('vectorization', False)}")
-             return True
-         except Exception as ex:
-             logger.exception("AzureSearchIndexer.create_index failed")
-             return False
-
-     def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             # Add IDs if missing
-             from uuid import uuid4
-             for d in docs:
-                 if key_field not in d:
-                     d[key_field] = str(uuid4())
-                 elif not isinstance(d[key_field], str):
-                     d[key_field] = str(d[key_field])
-
-             # Add vector embeddings if enabled
-             if self.config.get("vectorization"):
-                 vector_field = self.config.get("vector_field", "contentVector")
-                 content_field = self.config.get("content_field", "content")
-
-                 for doc in docs:
-                     if content_field in doc:
-                         try:
-                             embedding = self._get_embeddings(str(doc[content_field]))
-                             doc[vector_field] = embedding
-                         except Exception as e:
-                             logger.error(f"Failed to get embedding for document {doc.get(key_field)}: {str(e)}")
-                             continue
-
-             client = SearchClient(endpoint=endpoint, index_name=index_name,
-                                   credential=AzureKeyCredential(api_key))
-
-             logger.info(f"Uploading {len(docs)} documents to index {index_name}")
-             result = client.upload_documents(documents=docs)
-
-             failed = [r for r in result if not r.succeeded]
-             if failed:
-                 logger.error(f"Some documents failed to upload: {failed}")
-                 return False
-
-             logger.info("Documents uploaded successfully")
-             return True
-
-         except Exception:
-             logger.exception("AzureSearchIndexer.upload_documents failed")
-             return False
-
-     def index(self, rows: List[Dict[str, Any]]) -> bool:
-         """High level: create index (based on first row) and upload all rows."""
-         if not rows:
-             logger.error("AzureSearchIndexer.index called with empty rows")
-             return False
-
-         try:
-             if not self.validate_config():
-                 return False
-
-             sample = rows[0]
-             logger.info(f"Creating/updating index with sample: {sample}")
-
-             ok = self.create_index(sample)
-             if not ok:
-                 return False
-
-             ok2 = self.upload_documents(rows)
-             return ok2
-
-         except Exception:
-             logger.exception("AzureSearchIndexer.index failed")
-             return False