datasourcelib 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasourcelib/core/sync_manager.py +3 -1
- datasourcelib/datasources/azure_devops_source copy.py +126 -0
- datasourcelib/datasources/azure_devops_source.py +122 -64
- datasourcelib/datasources/datasource_types.py +2 -1
- datasourcelib/datasources/dataverse_source.py +291 -0
- datasourcelib/datasources/sharepoint_source.py +79 -26
- datasourcelib/datasources/sql_source.py +29 -12
- datasourcelib/datasources/sql_source_bkup.py +159 -0
- {datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/METADATA +4 -4
- {datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/RECORD +13 -10
- {datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/WHEEL +0 -0
- {datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/top_level.txt +0 -0
datasourcelib/core/sync_manager.py
@@ -10,6 +10,7 @@ from ..datasources.sql_source import SQLDataSource
 from ..datasources.azure_devops_source import AzureDevOpsSource
 from ..datasources.sharepoint_source import SharePointSource
 from ..datasources.blob_source import BlobStorageSource
+from ..datasources.dataverse_source import DataverseSource

 # concrete strategies
 from datasourcelib.strategies.full_load import FullLoadStrategy
@@ -35,7 +36,8 @@ class SyncManager:
         DataSourceType.SQL: SQLDataSource,
         DataSourceType.AZURE_DEVOPS: AzureDevOpsSource,
         DataSourceType.SHAREPOINT: SharePointSource,
-        DataSourceType.BLOB_STORAGE: BlobStorageSource
+        DataSourceType.BLOB_STORAGE: BlobStorageSource,
+        DataSourceType.Dataverse: DataverseSource
     }

     def execute_sync(self, sync_type: SyncType,
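The mapping change above registers the new DataverseSource under DataSourceType.Dataverse; note the member name departs from the UPPER_CASE style of its siblings (SQL, AZURE_DEVOPS, SHAREPOINT, BLOB_STORAGE). For orientation, a minimal sketch of how a type-to-class registry like this is usually dispatched; create_source is hypothetical and not part of the package:

    # Hypothetical dispatch helper; SyncManager's real construction logic may differ.
    from typing import Any, Dict, Type

    def create_source(source_type: Any,
                      registry: Dict[Any, Type],
                      config: Dict[str, Any]) -> Any:
        # Look up the concrete class registered for the enum member and
        # instantiate it with the caller-supplied config dict.
        cls = registry.get(source_type)
        if cls is None:
            raise ValueError(f"Unsupported data source type: {source_type}")
        return cls(config)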
datasourcelib/datasources/azure_devops_source copy.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Optional
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+import base64
+import json
+from bs4 import BeautifulSoup
+
+logger = get_logger(__name__)
+
+try:
+    import requests  # type: ignore
+except Exception:
+    requests = None  # lazy import handled at runtime
+
+class AzureDevOpsSource(DataSourceBase):
+
+    def validate_config(self) -> bool:
+        try:
+            require_keys(self.config, ["ado_organization", "ado_personal_access_token","ado_project","ado_query_id"])
+            return True
+        except Exception as ex:
+            logger.error("AzureDevOpsSource.validate_config: %s", ex)
+            return False
+
+    def connect(self) -> bool:
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        # No persistent connection; store auth header
+        pat = self.config.get("ado_personal_access_token")
+        token = pat
+        token_b64 = base64.b64encode(token.encode("utf-8")).decode("utf-8")
+        self._headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+        self._connected = True
+        logger.info("AzureDevOpsSource ready (no persistent connection required)")
+        return True
+
+    def disconnect(self) -> None:
+        self._headers = {}
+        self._connected = False
+        logger.info("AzureDevOpsSource cleared")
+
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        if not getattr(self, "_connected", False):
+            self.connect()
+
+        org = self.config.get("ado_organization")
+        project = self.config.get("ado_project")
+        query_id = self.config.get("ado_query_id")
+        api_version = self.config.get("api_version", "7.1")
+        #path = self.config.get("query_path", query or "")
+        if not query_id:
+            raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")
+
+        base = f"https://dev.azure.com/{org}/"
+        if project:
+            base = f"{base}{project}/"
+        url = f"{base}_apis/wit/wiql/{query_id}"
+        params = {"api-version": api_version}
+        method = self.config.get("method", "GET").upper()
+        query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+        query_response.raise_for_status()
+        #data = resp.json()
+        # Check if the request was successful
+        if query_response.status_code == 200:
+            work_items = query_response.json()['workItems']
+            work_item_details = []
+
+            # Loop through each work item ID to get detailed information
+            for item in work_items:
+                work_item_id = item['id']
+                work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
+                work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
+
+                if work_item_response.status_code == 200:
+                    logger.info(f"Current Item: {work_item_id}")
+                    text = work_item_response.json()['fields']['System.Description']
+                    c_desc=BeautifulSoup(text, "html.parser").get_text()
+                    c_changedate = work_item_response.json()['fields']['System.ChangedDate']
+                    c_title = work_item_response.json()['fields']['System.Title']
+                    c_status = work_item_response.json()['fields']['System.State']
+                    c_type = work_item_response.json()['fields']['System.WorkItemType']
+                    c_created = work_item_response.json()['fields']['System.CreatedDate']
+
+                    default_value = "-VALUE NOT ASSIGNED-"
+                    c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
+                    logger.info(c_assigned)
+                    c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
+                    c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
+                    c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
+                    c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
+
+                    #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                    fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                    # Ensure work_item_details is a list and append a dict for this work item
+
+                    work_item_details.append({
+                        "id": work_item_id,
+                        "type": c_type,
+                        "title": c_title,
+                        "status": c_status,
+                        "assigned_to": c_assigned,
+                        "created": c_created,
+                        "changed_date": c_changedate,
+                        "tags": c_tags,
+                        "release_type": c_rtype,
+                        "target_date": c_rdate,
+                        "project": c_project,
+                        "description": c_desc,
+                        "full": fullfeature
+                    })
+                else:
+                    logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
+
+            #work_item_desc = []
+            #for desc in work_item_details:
+            #    work_item_desc.append(desc['fields']['System.Description'])
+
+            return work_item_details #[{"response": json.dumps(work_item_details)}]
+        else:
+            raise RuntimeError(f"Error: {query_response.status_code}")
+        # Caller decides how to interpret the payload; default: return raw json in a single-item list
+
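Note on connect() above: it computes token_b64 from the PAT but never uses it, sending the raw PAT as a Bearer token instead. Azure DevOps PATs are also commonly sent as HTTP Basic credentials with an empty username; a minimal sketch of that variant, for comparison only (this is not what the package does):

    import base64

    def pat_basic_headers(pat: str) -> dict:
        # Basic form documented for Azure DevOps PATs: base64(":" + PAT),
        # i.e. an empty username with the PAT as the password.
        token_b64 = base64.b64encode(f":{pat}".encode("utf-8")).decode("utf-8")
        return {"Authorization": f"Basic {token_b64}", "Content-Type": "application/json"}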
datasourcelib/datasources/azure_devops_source.py
@@ -50,77 +50,135 @@ class AzureDevOpsSource(DataSourceBase):
         project = self.config.get("ado_project")
         query_id = self.config.get("ado_query_id")
         api_version = self.config.get("api_version", "7.1")
-        #path = self.config.get("query_path", query or "")
         if not query_id:
             raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")

         base = f"https://dev.azure.com/{org}/"
         if project:
             base = f"{base}{project}/"
-        url = f"{base}_apis/wit/wiql/{query_id}"
+        # WIQL query by id (returns list of work item refs)
+        wiql_url = f"{base}_apis/wit/wiql/{query_id}"
         params = {"api-version": api_version}
         method = self.config.get("method", "GET").upper()
-        query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+        query_response = requests.request(method, wiql_url, headers=getattr(self, "_headers", {}), params=params)
         query_response.raise_for_status()
-        #data = resp.json()
-        # Check if the request was successful
-        if query_response.status_code == 200:
-            work_items = query_response.json()['workItems']
-            work_item_details = []
-
-            # Loop through each work item ID to get detailed information
-            for item in work_items:
-                work_item_id = item['id']
-                work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
-                work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
-
-                if work_item_response.status_code == 200:
-                    logger.info(f"Current Item: {work_item_id}")
-                    text = work_item_response.json()['fields']['System.Description']
-                    c_desc=BeautifulSoup(text, "html.parser").get_text()
-                    c_changedate = work_item_response.json()['fields']['System.ChangedDate']
-                    c_title = work_item_response.json()['fields']['System.Title']
-                    c_status = work_item_response.json()['fields']['System.State']
-                    c_type = work_item_response.json()['fields']['System.WorkItemType']
-                    c_created = work_item_response.json()['fields']['System.CreatedDate']
-
-                    default_value = "-VALUE NOT ASSIGNED-"
-                    c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
-                    logger.info(c_assigned)
-                    c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
-                    c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
-                    c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
-                    c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
-
-                    #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                    fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                    # Ensure work_item_details is a list and append a dict for this work item
-
-                    work_item_details.append({
-                        "id": work_item_id,
-                        "type": c_type,
-                        "title": c_title,
-                        "status": c_status,
-                        "assigned_to": c_assigned,
-                        "created": c_created,
-                        "changed_date": c_changedate,
-                        "tags": c_tags,
-                        "release_type": c_rtype,
-                        "target_date": c_rdate,
-                        "project": c_project,
-                        "description": c_desc,
-                        "full": fullfeature
-                    })
-                else:
-                    logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
-
-            #work_item_desc = []
-            #for desc in work_item_details:
-            #    work_item_desc.append(desc['fields']['System.Description'])
-
-            return work_item_details #[{"response": json.dumps(work_item_details)}]
-        else:
+
+        if query_response.status_code != 200:
             raise RuntimeError(f"Error: {query_response.status_code}")
-        # Caller decides how to interpret the payload; default: return raw json in a single-item list
-
+
+        work_items_refs = query_response.json().get('workItems', []) or []
+        if not work_items_refs:
+            return []
+
+        # collect ids and fetch details in batch to get all fields for all work item types
+        ids = [str(item.get('id')) for item in work_items_refs if item.get('id')]
+        if not ids:
+            return []
+
+        details_url = f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems"
+        # expand=all to include fields, relations, and attachments
+        params = {
+            "ids": ",".join(ids),
+            "api-version": api_version,
+            "$expand": "all"
+        }
+        details_resp = requests.get(details_url, headers=getattr(self, "_headers", {}), params=params)
+        details_resp.raise_for_status()
+        items = details_resp.json().get("value", [])
+
+        work_item_details: List[Dict[str, Any]] = []
+        for item in items:
+            item_id = item.get("id")
+            fields = item.get("fields", {}) or {}
+
+            # Normalize field keys to safe snake_case-like keys
+            norm_fields: Dict[str, Any] = {}
+            for k, v in fields.items():
+                nk = k.replace(".", "_")
+                nk = nk.lower()
+                norm_fields[nk] = v
+
+            # Helper to safely extract nested displayName for assigned to
+            assigned = norm_fields.get("system_assignedto")
+            if isinstance(assigned, dict):
+                assigned_to = assigned.get("displayName") or assigned.get("uniqueName") or str(assigned)
+            else:
+                assigned_to = assigned
+
+            # find a description-like field (some types use different field names)
+            desc = ""
+            for fk in ["system_description", "microsoft_vsts_createdby", "html_description"]:
+                if fk in norm_fields:
+                    desc = norm_fields.get(fk) or ""
+                    break
+            if not desc:
+                # fallback: first field key that contains 'description'
+                for kf, vf in norm_fields.items():
+                    if "description" in kf and vf:
+                        desc = vf
+                        break
+
+            # clean HTML description to text
+            try:
+                c_desc = BeautifulSoup(desc or "", "html.parser").get_text()
+            except Exception:
+                c_desc = desc or ""
+
+            # Build common convenience values (use available fields)
+            wi_type = norm_fields.get("system_workitemtype") or norm_fields.get("system_witype") or ""
+            title = norm_fields.get("system_title") or ""
+            status = norm_fields.get("system_state") or ""
+            created = norm_fields.get("system_createddate") or norm_fields.get("system_created") or ""
+            changed = norm_fields.get("system_changeddate") or norm_fields.get("system_changed") or ""
+            tags = norm_fields.get("system_tags", "")
+            project_name = norm_fields.get("custom.projectname") or norm_fields.get("system_teamproject") or ""

+            rtype = norm_fields.get("custom.releasetype") or norm_fields.get("custom_releasetype") or ""
+            target_date = norm_fields.get("microsoft_vsts_scheduling_targetdate") or norm_fields.get("microsoft.vsts.scheduling.targetdate") or ""
+
+            # Construct a 'full' description string using available pieces
+            parts = []
+            if wi_type:
+                parts.append(f"{wi_type} ID {item_id}")
+            else:
+                parts.append(f"WorkItem {item_id}")
+            if created:
+                parts.append(f"was created on {created}")
+            if title:
+                parts.append(f"and has Title '{title}'")
+            if status:
+                parts.append(f"is currently in {status} state")
+            if assigned_to:
+                parts.append(f"is assigned to {assigned_to}")
+            if project_name:
+                parts.append(f"for Project '{project_name}'")
+            if rtype:
+                parts.append(f"release type '{rtype}'")
+            if target_date:
+                parts.append(f"with target date '{target_date}'")
+            if tags:
+                parts.append(f"Tags: {tags}")
+            if c_desc:
+                parts.append(f"Description: [{c_desc}]")
+            fullfeature = ". ".join(parts)
+
+            # include all normalized fields in the returned object for completeness
+            entry = {
+                "id": item_id,
+                "type": wi_type,
+                "title": title,
+                "status": status,
+                "assigned_to": assigned_to,
+                "created": created,
+                "changed_date": changed,
+                "tags": tags,
+                "project": project_name,
+                "release_type": rtype,
+                "target_date": target_date,
+                "description": c_desc,
+                "full": fullfeature,
+                "fields": norm_fields  # full field set for this work item
+            }
+            work_item_details.append(entry)
+
+        return work_item_details
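The rewrite replaces the per-item GET loop with a single batched workitems request and normalizes field keys (dots to underscores, lower-cased). One consequence visible in the diff: lookups that keep their dots, such as norm_fields.get("custom.projectname") and norm_fields.get("custom.releasetype"), can never match a normalized key, so only their underscore fallbacks can ever fire. A standalone illustration of the normalization, with made-up sample values:

    # Dots become underscores and keys are lower-cased, exactly as in the loop above.
    fields = {"System.Title": "Fix login bug", "Custom.ProjectName": "Phoenix"}
    norm_fields = {k.replace(".", "_").lower(): v for k, v in fields.items()}
    assert norm_fields == {"system_title": "Fix login bug", "custom_projectname": "Phoenix"}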
datasourcelib/datasources/dataverse_source.py
@@ -0,0 +1,291 @@
+from typing import Any, Dict, List, Optional, Tuple
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+import pyodbc
+import time
+import pandas as pd
+
+# optional requests import (webapi mode)
+try:
+    import requests  # type: ignore
+except Exception:
+    requests = None  # lazy import
+
+logger = get_logger(__name__)
+
+class DataverseSource(DataSourceBase):
+
+    def __init__(self, config: Dict[str, Any]):
+        super().__init__(config)
+        self._conn = None
+        self._mode = (self.config.get("dv_mode") or "tds").lower()  # "tds" or "webapi"
+        self._access_token: Optional[str] = None
+        self._headers: Dict[str, str] = {}
+        self._max_retries = int(self.config.get("dv_max_retries", 3))
+
+    def validate_config(self) -> bool:
+        """
+        Validate required keys depending on selected dv_mode.
+        - tds: requires either 'tds_connection_string' OR ('dataverse_server' and 'dataverse_database')
+        - webapi: requires 'webapi_url','client_id','client_secret','tenant_id' (or 'resource')
+        """
+        try:
+            if self._mode == "webapi":
+                require_keys(self.config, ["dv_webapi_url", "dv_webapi_client_id", "dv_webapi_client_secret", "dv_webapi_tenant_id"])
+            else:
+                # TDS mode (ODBC)
+                if "dv_tds_connection_string" in self.config:
+                    return True
+                # otherwise require components
+                require_keys(self.config, ["dv_tds_server", "dv_tds_database"])
+                # if not using integrated auth require creds
+                if not bool(self.config.get("dv_tds_windows_auth", False)):
+                    require_keys(self.config, ["dv_tds_username", "dv_tds_password"])
+            return True
+        except Exception as ex:
+            logger.error("DataverseSource.validate_config failed: %s", ex)
+            return False
+
+    # -------------------------
+    # Connection helpers
+    # -------------------------
+    def _get_available_driver(self) -> str:
+        """Return first suitable ODBC driver for SQL/Dataverse TDS access."""
+        preferred_drivers = [
+            "ODBC Driver 18 for SQL Server",
+            "ODBC Driver 17 for SQL Server",
+            "SQL Server Native Client 11.0",
+            "SQL Server"
+        ]
+        try:
+            drivers = pyodbc.drivers()
+            logger.info("Available ODBC drivers: %s", drivers)
+
+            for d in preferred_drivers:
+                if d in drivers:
+                    logger.info("Using ODBC driver: %s", d)
+                    return d
+
+            # fallback to first available
+            if drivers:
+                logger.warning("No preferred driver found. Using: %s", drivers[0])
+                return drivers[0]
+            raise RuntimeError("No ODBC drivers available")
+        except Exception as ex:
+            logger.error("DataverseSource._get_available_driver failed: %s", ex)
+            raise
+
+    def _build_tds_conn_str(self) -> str:
+        """Build valid connection string with proper parameter names."""
+        if "dv_tds_connection_string" in self.config:
+            return self.config["dv_tds_connection_string"]
+
+        driver = self._get_available_driver()
+        # Fix: use correct config key names (dv_tds_server, not dv_tds_dataverse_server)
+        server = self.config.get("dv_tds_server", "").strip()
+        database = self.config.get("dv_tds_database", "").strip()
+
+        if not server:
+            raise ValueError("dv_tds_server are required")
+
+        logger.info("Building TDS connection (driver=%s, server=%s, database=%s)", driver, server, database)
+
+        # Use curly braces for driver name (handles spaces in driver names)
+        parts = [f"DRIVER={{{driver}}}"]
+        parts.append(f"Server={server}")
+        parts.append(f"Database={database}")
+        password = None
+        if bool(self.config.get("dv_tds_windows_auth", False)):
+            parts.append("Trusted_Connection=yes")
+            logger.info("Using Windows authentication")
+        else:
+            username = self.config.get("dv_tds_username", "").strip()
+            password = self.config.get("dv_tds_password", "").strip()
+
+            if not username or not password:
+                raise ValueError("dv_tds_username and dv_tds_password required when Windows auth disabled")
+
+            parts.append(f"UID={username}")
+            parts.append(f"PWD={password}")
+            parts.append("Authentication=ActiveDirectoryInteractive")
+        # Encryption settings
+        if not bool(self.config.get("dv_tds_is_onprem", False)):
+            parts.append("Encrypt=yes")
+            parts.append("TrustServerCertificate=no")
+        else:
+            parts.append("Encrypt=optional")
+            parts.append("TrustServerCertificate=yes")
+
+        conn_str = ";".join(parts)
+        logger.debug("Connection string: %s", conn_str.replace(password or "", "***") if password else conn_str)
+        return conn_str
+
+    def _obtain_webapi_token(self) -> Tuple[str, Dict[str, str]]:
+        """
+        Acquire OAuth2 token using client credentials flow.
+        Returns (access_token, headers)
+        Config expected keys: tenant_id, client_id, client_secret, optional resource
+        """
+        if requests is None:
+            raise RuntimeError("requests package required for Dataverse Web API mode")
+        tenant = self.config["dv_webapi_tenant_id"]
+        client_id = self.config["dv_webapi_client_id"]
+        client_secret = self.config["dv_webapi_client_secret"]
+        # resource or scope: prefer explicit resource, else fallback to webapi_url host
+        resource = self.config.get("dv_webapi_resource")
+        if not resource:
+            # infer resource from webapi_url e.g. https://<org>.crm.dynamics.com
+            webapi_url = self.config["dv_webapi_url"].rstrip("/")
+            resource = webapi_url.split("://")[-1]
+            resource = f"https://{resource}"  # as resource
+        token_url = f"https://login.microsoftonline.com/{tenant}/oauth2/v2.0/token"
+        data = {
+            "grant_type": "client_credentials",
+            "client_id": client_id,
+            "client_secret": client_secret,
+            "scope": f"{resource}/.default"
+        }
+        resp = requests.post(token_url, data=data, timeout=30)
+        resp.raise_for_status()
+        j = resp.json()
+        token = j.get("access_token")
+        if not token:
+            raise RuntimeError("Failed to obtain access token for Dataverse webapi")
+        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json", "OData-MaxVersion": "4.0", "OData-Version": "4.0"}
+        return token, headers
+
+    # -------------------------
+    # Public connection API
+    # -------------------------
+    def connect(self) -> bool:
+        try:
+            if self._mode == "webapi":
+                token, headers = self._obtain_webapi_token()
+                self._access_token = token
+                self._headers = headers
+                self._connected = True
+                logger.info("DataverseSource connected (webapi mode) to %s", self.config.get("dv_webapi_url"))
+                return True
+            # else TDS mode
+            conn_str = self._build_tds_conn_str()
+            self._conn = pyodbc.connect(conn_str, timeout=int(self.config.get("dv_tds_timeout", 30)))
+            self._connected = True
+            logger.info("DataverseSource connected (dv_tds mode) to %s/%s", self.config.get("dv_server"), self.config.get("dv_database"))
+            return True
+        except pyodbc.Error as ex:
+            logger.error("DataverseSource.connect failed - ODBC Error: %s", ex)
+            self._connected = False
+            return False
+        except requests.RequestException as ex:
+            logger.error("DataverseSource.connect failed - HTTP Error: %s", ex)
+            self._connected = False
+            return False
+        except Exception as ex:
+            logger.exception("DataverseSource.connect failed")
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        try:
+            if self._conn:
+                try:
+                    self._conn.close()
+                except Exception:
+                    pass
+            self._conn = None
+            self._access_token = None
+            self._headers = {}
+        finally:
+            self._connected = False
+            logger.info("DataverseSource disconnected")
+
+    # -------------------------
+    # Data fetch
+    # -------------------------
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Fetch rows from Dataverse.
+        - TDS mode: executes SQL query (config key 'tds_query' or provided 'query')
+        - WebAPI mode: calls Dataverse Web API path fragment (e.g. 'accounts?$select=name') or uses 'entity_set' + query params
+        Returns list[dict].
+        """
+        attempt = 0
+        while attempt < self._max_retries:
+            try:
+                if not getattr(self, "_connected", False):
+                    ok = self.connect()
+                    if not ok:
+                        raise RuntimeError("DataverseSource: cannot connect")
+
+                if self._mode == "webapi":
+                    if requests is None:
+                        raise RuntimeError("requests package required for webapi mode")
+                    webapi_url = self.config["dv_webapi_url"].rstrip("/")
+                    # if query provided, treat it as path fragment; else use entity_set from config
+                    path_fragment = query or self.config.get("dv_webapi_entity_set")
+                    if not path_fragment:
+                        raise ValueError("DataverseSource.fetch_data requires a webapi 'query' or 'entity_set' in config")
+                    url = f"{webapi_url}/api/data/v9.1/{path_fragment.lstrip('/')}"
+                    params = kwargs.get("params")
+                    resp = requests.get(url, headers=self._headers, params=params, timeout=60)
+                    resp.raise_for_status()
+                    j = resp.json()
+                    items: Any = []
+                    # Dataverse OData responses typically use 'value' for collections
+                    if isinstance(j, dict) and "value" in j:
+                        items = j["value"]
+                    # otherwise return the raw json wrapped in a list or as-is
+                    elif isinstance(j, list):
+                        items = j
+                    else:
+                        items = [j]
+
+                    df = pd.DataFrame(items)
+                    # filter columns if configured
+                    keep = self.config.get("dv_webapi_columns_to_keep")
+                    if isinstance(keep, list) and keep:
+                        cols_to_keep = [c for c in df.columns if c in keep]
+                    else:
+                        # exclude SharePoint metadata columns (start with '__' or prefixed with '@')
+                        cols_to_keep = [c for c in df.columns if not str(c).startswith("__") and not str(c).startswith("@")]
+                    df = df[cols_to_keep]
+                    results = df.to_dict("records")
+                    return results
+                # else TDS mode
+                sql = query or self.config.get("dv_tds_query") or self.config.get("dv_sql_query")
+                if not sql:
+                    raise ValueError("DataverseSource.fetch_data requires a SQL query (tds mode)")
+
+                cur = self._conn.cursor()
+                try:
+                    cur.execute(sql)
+                    cols = [c[0] for c in (cur.description or [])]
+                    rows = cur.fetchall()
+                    results: List[Dict[str, Any]] = []
+                    for r in rows:
+                        results.append({cols[i]: r[i] for i in range(len(cols))})
+                    return results
+                finally:
+                    try:
+                        cur.close()
+                    except Exception:
+                        pass
+
+            except Exception as ex:
+                attempt += 1
+                logger.warning("DataverseSource.fetch_data attempt %d/%d failed: %s", attempt, self._max_retries, ex)
+                # transient retry for network/connection errors
+                if attempt >= self._max_retries:
+                    logger.exception("DataverseSource.fetch_data final failure")
+                    raise
+                # backoff
+                time.sleep(min(2 ** attempt, 10))
+                # try reconnect for next attempt
+                try:
+                    self.disconnect()
+                except Exception:
+                    pass
+
+        # unreachable; defensive
+        return []
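For orientation, here is what configurations for the two modes might look like, assembled from the dv_* keys the class reads; all values are placeholders, and only the key names come from the diff:

    # TDS (ODBC) mode: SQL-style access over the Dataverse TDS endpoint.
    tds_config = {
        "dv_mode": "tds",
        "dv_tds_server": "myorg.crm.dynamics.com,5558",
        "dv_tds_database": "myorg",
        "dv_tds_username": "user@example.com",
        "dv_tds_password": "<secret>",
        "dv_tds_query": "SELECT TOP 10 name FROM account",
    }

    # Web API mode: OData over HTTPS with a client-credentials token.
    webapi_config = {
        "dv_mode": "webapi",
        "dv_webapi_url": "https://myorg.crm.dynamics.com",
        "dv_webapi_tenant_id": "<tenant-guid>",
        "dv_webapi_client_id": "<app-id>",
        "dv_webapi_client_secret": "<secret>",
        "dv_webapi_entity_set": "accounts?$select=name",
    }

Two behaviors worth knowing when reading the TDS branch: _build_tds_conn_str appends Authentication=ActiveDirectoryInteractive even when a username and password are supplied, and the connect() log line reads dv_server/dv_database rather than the dv_tds_* keys it actually connects with.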
datasourcelib/datasources/sharepoint_source.py
@@ -7,6 +7,7 @@ import requests
 import pandas as pd
 import os
 from uuid import uuid4
+from datetime import datetime, timedelta

 logger = get_logger(__name__)
 reader = ByteReader()
@@ -114,50 +115,86 @@ class SharePointSource(DataSourceBase):
         self._drive_id = drives[0].get("id")
         logger.info("Resolved SharePoint drive ID: %s", self._drive_id)

+    def _get_client_credentials(self) -> Tuple[str, str]:
+        """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+        # Fallback to sp_client_config
+        sp_client_config = self.config.get("sp_client_config", {})
+        client_id = sp_client_config.get("sp_client_id")
+        client_secret = sp_client_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            # Fallback to sp_master_config
+            sp_master_config = self.config.get("sp_master_config", {})
+            client_id = client_id or sp_master_config.get("sp_client_id")
+            client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+        return client_id, client_secret
+
+    def _get_download_credentials(self) -> Tuple[str, str]:
+        """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+        # Check sp_download_config first
+        sp_download_config = self.config.get("sp_client_config", {}).get("sp_download_config", {})
+        client_id = sp_download_config.get("sp_client_id")
+        client_secret = sp_download_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            # Fallback to sp_client_config
+            sp_client_config = self.config.get("sp_client_config", {})
+            client_id = client_id or sp_client_config.get("sp_client_id")
+            client_secret = client_secret or sp_client_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            # Fallback to sp_master_config
+            sp_master_config = self.config.get("sp_master_config", {})
+            client_id = client_id or sp_master_config.get("sp_client_id")
+            client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+        return client_id, client_secret
+
+
     def connect(self) -> bool:
         try:
             # basic values
             self._site_url = self.config["sp_site_url"]
-            client_config = self.config["sp_client_config"]
             master_config = self.config["sp_master_config"]

             # get master token (Sites.Read.All)
             try:
-                self._master_token = self._get_token(
-                    master_config["sp_client_id"], master_config["sp_client_secret"], master_config["sp_tenant_id"]
-                )
+                master_client_id = master_config["sp_client_id"]
+                master_client_secret = master_config["sp_client_secret"]
+                self._master_token = self._get_token(master_client_id, master_client_secret, master_config["sp_tenant_id"])
                 logger.info("$$$ - Obtained master access token for SharePoint - $$$")
             except Exception as ex:
                 logger.info("$$$ - Failed to obtain master token - $$$")

             # resolve site and drive ids
             try:
-                self._resolve_site_and_drive(
-                    self.config['sp_site_display_name']
-                )
+                self._resolve_site_and_drive(self.config['sp_site_display_name'])
             except Exception:
                 logger.info("$$$ - Failed to resolve site/drive - $$$")
-
+
             # get client token (Site.Selected) for download operations
             try:
-
-                self._access_token = self._get_token(
-                    client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"]
-                )
+                client_id, client_secret = self._get_client_credentials()
+                self._access_token = self._get_token(client_id, client_secret, master_config["sp_tenant_id"])
                 logger.info("$$$ - Obtained client access token for SharePoint downloads - $$$")
             except Exception:
                 logger.info("$$$ - Failed to obtain client access token - $$$")
-
+
             # get list client token (Site.Selected) for list operations
             try:
-
-                self._list_token = self._get_list_token(
-                    client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"],master_config["sp_domain_name"]
-                )
+                client_id, client_secret = self._get_client_credentials()
+                self._list_token = self._get_list_token(client_id, client_secret, master_config["sp_tenant_id"], master_config["sp_domain_name"])
                 logger.info("$$$ - Obtained client list token for SharePoint list operations - $$$")
             except Exception:
                 logger.info("$$$ - Failed to obtain client list token - $$$")
-
+
             self._connected = True
             logger.info("SharePointSource connected for site: %s", self._site_url)
             return True
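_get_client_credentials and _get_download_credentials differ only in which config block they try first, so the fallback chain can be expressed once; a compact equivalent written for illustration, not part of the package:

    from typing import Any, Dict, Tuple

    def resolve_credentials(*config_blocks: Dict[str, Any]) -> Tuple[str, str]:
        # Walk candidate config blocks in priority order and return the first
        # complete (client_id, client_secret) pair found.
        client_id = client_secret = None
        for block in config_blocks:
            client_id = client_id or block.get("sp_client_id")
            client_secret = client_secret or block.get("sp_client_secret")
            if client_id and client_secret:
                return client_id, client_secret
        raise ValueError("Client ID and Client Secret must be provided in the configuration.")

With this helper, the download path would be resolve_credentials(sp_download_config, sp_client_config, sp_master_config), and the client path would simply drop the first argument.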
@@ -324,10 +361,9 @@ class SharePointSource(DataSourceBase):
         results = []
         items = self._fetch_list_items_via_rest(relative_path)

-
-
-
-        )
+        client_id, client_secret = self._get_download_credentials()
+
+        self._access_token = self._get_token(client_id, client_secret, self.config.get("sp_master_config",{})["sp_tenant_id"])
         #test running with hardcoded items
         if False:
             items = []
@@ -350,12 +386,29 @@ class SharePointSource(DataSourceBase):
             })

         for item in items:
-
-
+            #the path after [Shared Documents/] in relative path
+            item_relative_path = item.get("RelativePath") or item.get("relativepath") or item.get("relativePath")
+            item_name = item.get("Title") or item.get("title")
+            item_display_name = item.get("SiteDisplayName") or item.get("sitedisplayname") or item.get("siteDisplayName")
+
+            # Check ModifiedDate filter
+            # "2024-01-15" → 10 chars || "20240115" → 8 chars
+            modified_date_str = item.get("ModifiedDate") or item.get("modifieddate") or item.get("modifiedDate")
+            if modified_date_str:
+                try:
+                    modified_date = datetime.fromisoformat(modified_date_str.replace('Z', '+00:00'))
+                    if datetime.now(modified_date.tzinfo) - modified_date < timedelta(days=1):
+                        continue
+                except Exception:
+                    pass
+
+            if not item_relative_path:
+                logger.warning("Item missing RelativePath: %s", item)
+                continue
+
             #get site id and drive id for this item
             self._resolve_site_and_drive(item_display_name)
-
-            item_relative_path = item.get("RelativePath")
+
             try:
                 content, filename = self._download_file_bytes(item_relative_path)
                 saved = self._save_file_if_requested(content, filename, save_path)
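The new per-item filter parses the list item's Z-suffixed ISO-8601 ModifiedDate and skips (continue) any item modified within the last day. A standalone illustration of the parsing step, with a made-up timestamp; the replace() is needed because datetime.fromisoformat only accepts a literal 'Z' suffix from Python 3.11 on:

    from datetime import datetime, timedelta

    modified_date_str = "2024-01-15T09:30:00Z"
    # Convert the trailing 'Z' to an explicit UTC offset before parsing.
    modified_date = datetime.fromisoformat(modified_date_str.replace('Z', '+00:00'))
    is_recent = datetime.now(modified_date.tzinfo) - modified_date < timedelta(days=1)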
datasourcelib/datasources/sql_source.py
@@ -16,8 +16,16 @@ class SQLDataSource(DataSourceBase):
         self._is_sqlite = False

     def validate_config(self) -> bool:
+        """
+        Validate config. If sql_windows_auth is True then sql_username/sql_password are optional.
+        Otherwise require sql_username and sql_password.
+        """
         try:
-
+            # Always require server/database at minimum
+            require_keys(self.config, ["sql_server", "sql_database"])
+            # If not using Windows authentication, require credentials
+            if not bool(self.config.get("sql_windows_auth", False)):
+                require_keys(self.config, ["sql_username", "sql_password"])
             return True
         except Exception as ex:
             logger.error("SQLDataSource.validate_config: %s", ex)
@@ -27,22 +35,31 @@ class SQLDataSource(DataSourceBase):
         try:
             sql_server = self.config.get("sql_server", "")
             sql_database = self.config.get("sql_database", "")
-            sql_username = self.config.get("sql_username", "")
-            sql_password = self.config.get("sql_password", "")
             sql_is_onprem = self.config.get("sql_is_onprem", False)
-
+
+            # Determine auth mode: sql_windows_auth (Trusted Connection) overrides username/password
+            sql_windows_auth = bool(self.config.get("sql_windows_auth", False))
+
             # Get available driver
             sql_driver = self._get_available_driver()
-
-            # Build connection string
+
+            # Build connection string
             conn_params = [
                 f'DRIVER={sql_driver}',
                 f'SERVER={sql_server}',
                 f'DATABASE={sql_database}',
-                f'UID={sql_username}',
-                f'PWD={sql_password}'
             ]
-
+
+            if sql_windows_auth:
+                # Use integrated Windows authentication (Trusted Connection)
+                # This will use the current process credentials / kerberos ticket.
+                conn_params.append('Trusted_Connection=yes')
+                logger.info("SQLDataSource using Windows (integrated) authentication")
+            else:
+                sql_username = self.config.get("sql_username", "")
+                sql_password = self.config.get("sql_password", "")
+                conn_params.extend([f'UID={sql_username}', f'PWD={sql_password}'])
+
             # Add encryption settings based on environment
             if not sql_is_onprem:
                 conn_params.extend([
@@ -56,13 +73,13 @@ class SQLDataSource(DataSourceBase):
                 ])

             conn_str = ';'.join(conn_params)
-
+
             # Attempt connection with timeout
             self._conn = pyodbc.connect(conn_str, timeout=30)
             self._connected = True
-            logger.info("SQLDataSource connected to %s using driver %s", sql_server, sql_driver)
+            logger.info("SQLDataSource connected to %s using driver %s (sql_windows_auth=%s)", sql_server, sql_driver, sql_windows_auth)
             return True
-
+
         except pyodbc.Error as ex:
             logger.error("SQLDataSource.connect failed - ODBC Error: %s", ex)
             self._connected = False
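The effect of the new sql_windows_auth flag is easiest to see in the connection strings that connect() ends up joining; a sketch with placeholder server, database, and credential values:

    # Integrated (Windows) authentication, cloud defaults (sql_is_onprem=False):
    windows_auth_conn = (
        "DRIVER=ODBC Driver 18 for SQL Server;SERVER=myhost;DATABASE=mydb;"
        "Trusted_Connection=yes;Encrypt=yes;TrustServerCertificate=no"
    )
    # SQL authentication in the same environment:
    sql_auth_conn = (
        "DRIVER=ODBC Driver 18 for SQL Server;SERVER=myhost;DATABASE=mydb;"
        "UID=myuser;PWD=<secret>;Encrypt=yes;TrustServerCertificate=no"
    )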
datasourcelib/datasources/sql_source_bkup.py
@@ -0,0 +1,159 @@
+from typing import Any, Dict, List, Optional
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+import os
+import pyodbc
+
+
+logger = get_logger(__name__)
+
+class SQLDataSource(DataSourceBase):
+
+    def __init__(self, config: Dict[str, Any]):
+        super().__init__(config)
+        self._conn = None
+        self._is_sqlite = False
+
+    def validate_config(self) -> bool:
+        """
+        Validate config. If sql_windows_auth is True then sql_username/sql_password are optional.
+        Otherwise require sql_username and sql_password.
+        """
+        try:
+            # Always require server/database at minimum
+            require_keys(self.config, ["sql_server", "sql_database"])
+            # If not using Windows authentication, require credentials
+            if not bool(self.config.get("sql_windows_auth", False)):
+                require_keys(self.config, ["sql_username", "sql_password"])
+            return True
+        except Exception as ex:
+            logger.error("SQLDataSource.validate_config: %s", ex)
+            return False
+
+    def connect(self) -> bool:
+        try:
+            sql_server = self.config.get("sql_server", "")
+            sql_database = self.config.get("sql_database", "")
+            sql_is_onprem = self.config.get("sql_is_onprem", False)
+
+            # Determine auth mode: sql_windows_auth (Trusted Connection) overrides username/password
+            sql_windows_auth = bool(self.config.get("sql_windows_auth", False))
+
+            # Get available driver
+            sql_driver = self._get_available_driver()
+
+            # Build connection string
+            conn_params = [
+                f'DRIVER={sql_driver}',
+                f'SERVER={sql_server}',
+                f'DATABASE={sql_database}',
+            ]
+
+            if sql_windows_auth:
+                # Use integrated Windows authentication (Trusted Connection)
+                # This will use the current process credentials / kerberos ticket.
+                conn_params.append('Trusted_Connection=yes')
+                logger.info("SQLDataSource using Windows (integrated) authentication")
+            else:
+                sql_username = self.config.get("sql_username", "")
+                sql_password = self.config.get("sql_password", "")
+                conn_params.extend([f'UID={sql_username}', f'PWD={sql_password}'])
+
+            # Add encryption settings based on environment
+            if not sql_is_onprem:
+                conn_params.extend([
+                    'Encrypt=yes',
+                    'TrustServerCertificate=no'
+                ])
+            else:
+                conn_params.extend([
+                    'Encrypt=optional',
+                    'TrustServerCertificate=yes'
+                ])
+
+            conn_str = ';'.join(conn_params)
+
+            # Attempt connection with timeout
+            self._conn = pyodbc.connect(conn_str, timeout=30)
+            self._connected = True
+            logger.info("SQLDataSource connected to %s using driver %s (sql_windows_auth=%s)", sql_server, sql_driver, sql_windows_auth)
+            return True
+
+        except pyodbc.Error as ex:
+            logger.error("SQLDataSource.connect failed - ODBC Error: %s", ex)
+            self._connected = False
+            return False
+        except Exception as ex:
+            logger.error("SQLDataSource.connect failed - Unexpected Error: %s", ex)
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        try:
+            if self._conn:
+                self._conn.close()
+        finally:
+            self._conn = None
+            self._connected = False
+            logger.info("SQLDataSource disconnected")
+
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        max_retries = 3
+        retry_count = 0
+
+        while retry_count < max_retries:
+            try:
+                if not self._connected:
+                    ok = self.connect()
+                    if not ok:
+                        raise RuntimeError("SQLDataSource: not connected and cannot connect")
+
+                query = self.config.get("sql_query")
+                if not query:
+                    raise ValueError("SQLDataSource.fetch_data requires a query")
+
+                cur = self._conn.cursor()
+                try:
+                    cur.execute(query)
+                    cols = [d[0] if hasattr(d, "__len__") else d[0] for d in (cur.description or [])]
+                    rows = cur.fetchall()
+                    results: List[Dict[str, Any]] = []
+                    for r in rows:
+                        results.append({cols[i]: r[i] for i in range(len(cols))})
+                    return results
+                finally:
+                    try:
+                        cur.close()
+                    except Exception:
+                        pass
+
+            except pyodbc.OperationalError as ex:
+                # Handle connection lost
+                retry_count += 1
+                logger.warning("Connection lost, attempt %d of %d: %s", retry_count, max_retries, ex)
+                self.disconnect()
+                if retry_count >= max_retries:
+                    raise
+            except Exception as ex:
+                logger.error("Query execution failed: %s", ex)
+                raise
+
+    def _get_available_driver(self) -> str:
+        """Get first available SQL Server driver from preferred list."""
+        preferred_drivers = [
+            'ODBC Driver 18 for SQL Server',
+            'ODBC Driver 17 for SQL Server',
+            'SQL Server Native Client 11.0',
+            'SQL Server'
+        ]
+
+        try:
+            available_drivers = pyodbc.drivers()
+            for driver in preferred_drivers:
+                if driver in available_drivers:
+                    return driver
+            raise RuntimeError(f"No suitable SQL Server driver found. Available drivers: {available_drivers}")
+        except Exception as ex:
+            logger.error("Failed to get SQL drivers: %s", ex)
+            raise
{datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/METADATA
@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: datasourcelib
-Version: 0.1.4
+Version: 0.1.6
 Summary: Data source sync strategies for vector DBs
-Home-page: https://github.com/
-Author:
-Author-email:
+Home-page: https://github.com/akashmaurya0217/datasourcelib
+Author: Akash Kumar Maurya
+Author-email: mrelectronicsarduino@gmail.com
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
{datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/RECORD
@@ -1,16 +1,19 @@
 datasourcelib/__init__.py,sha256=I7JTSZ1J6ULg_TfdMEgFcd1regkCHuyKdZT4DcPtoyQ,78
 datasourcelib/core/__init__.py,sha256=nsXojDd97T7eMqqtCsZr1qSYLBitvKydSZRb9Dg7hqU,462
 datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3NAY,716
-datasourcelib/core/sync_manager.py,sha256=
+datasourcelib/core/sync_manager.py,sha256=pep3lS9GINzhOnwrMSPnOh5rfIsMbu8a0TEkTyq4yRk,3961
 datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
 datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
-datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+datasourcelib/datasources/azure_devops_source.py,sha256=3hyZIrUdgwZEQNjb2iZGDMJcAw3Z6r7oV0hWAq_zMsg,8005
 datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
 datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
-datasourcelib/datasources/datasource_types.py,sha256=
+datasourcelib/datasources/datasource_types.py,sha256=jpm4f9n1l7X9aBD58Pbr9evXiCHHEhRCLojGwchUD7A,205
+datasourcelib/datasources/dataverse_source.py,sha256=8qScGvTvMOVeDc_ODYtBmx97L9AIlokz3wkzioT_ovw,13296
 datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
-datasourcelib/datasources/sharepoint_source.py,sha256=
-datasourcelib/datasources/sql_source.py,sha256=
+datasourcelib/datasources/sharepoint_source.py,sha256=t3rly2mVEI2qEDuUVqstck5ktkZW0BnF16Bke_NjPLI,23126
+datasourcelib/datasources/sql_source.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
+datasourcelib/datasources/sql_source_bkup.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
 datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
 datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
 datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
@@ -25,8 +28,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
 datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
 datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
 datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
-datasourcelib-0.1.4.dist-info/licenses/LICENSE,sha256=
-datasourcelib-0.1.4.dist-info/METADATA,sha256=
-datasourcelib-0.1.4.dist-info/WHEEL,sha256=
-datasourcelib-0.1.4.dist-info/top_level.txt,sha256=
-datasourcelib-0.1.4.dist-info/RECORD,,
+datasourcelib-0.1.6.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+datasourcelib-0.1.6.dist-info/METADATA,sha256=5lpuBdVreQu7PHsMoD9RWsnSx2cZjpKLEjFhclwO5oA,1199
+datasourcelib-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datasourcelib-0.1.6.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+datasourcelib-0.1.6.dist-info/RECORD,,
{datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/WHEEL
File without changes
{datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/licenses/LICENSE
File without changes
{datasourcelib-0.1.4.dist-info → datasourcelib-0.1.6.dist-info}/top_level.txt
File without changes