datasourcelib 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasourcelib/core/sync_base.py +1 -1
- datasourcelib/core/sync_manager.py +33 -12
- datasourcelib/datasources/datasource_types.py +2 -1
- datasourcelib/datasources/dataverse_source.py +305 -0
- datasourcelib/datasources/sql_source_bkup.py +159 -0
- datasourcelib/strategies/daily_load.py +35 -10
- datasourcelib/strategies/full_load.py +86 -25
- datasourcelib/strategies/incremental_load.py +19 -6
- datasourcelib/strategies/ondemand_load.py +19 -5
- datasourcelib/strategies/timerange_load.py +19 -5
- datasourcelib/utils/aggregation.py +152 -0
- {datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/METADATA +1 -1
- {datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/RECORD +16 -13
- {datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/WHEEL +0 -0
- {datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/top_level.txt +0 -0
datasourcelib/core/sync_base.py
CHANGED
datasourcelib/core/sync_manager.py
CHANGED
@@ -10,6 +10,7 @@ from ..datasources.sql_source import SQLDataSource
 from ..datasources.azure_devops_source import AzureDevOpsSource
 from ..datasources.sharepoint_source import SharePointSource
 from ..datasources.blob_source import BlobStorageSource
+from ..datasources.dataverse_source import DataverseSource
 
 # concrete strategies
 from datasourcelib.strategies.full_load import FullLoadStrategy
@@ -35,11 +36,12 @@ class SyncManager:
         DataSourceType.SQL: SQLDataSource,
         DataSourceType.AZURE_DEVOPS: AzureDevOpsSource,
         DataSourceType.SHAREPOINT: SharePointSource,
-        DataSourceType.BLOB_STORAGE: BlobStorageSource
+        DataSourceType.BLOB_STORAGE: BlobStorageSource,
+        DataSourceType.Dataverse: DataverseSource
     }
 
-    def execute_sync(self, sync_type:
-                     source_type:
+    def execute_sync(self, sync_type: str,
+                     source_type: str,
                      source_config: Dict[str, Any],
                      vector_db_config: Dict[str, Any],
                      **kwargs) -> Dict[str, Any]:
@@ -47,6 +49,33 @@
         logger.info(f"Execute {sync_type} sync using {source_type} source")
 
         try:
+            # validate and convert sync_type and source_type to their Enum members
+            def _to_enum(enum_cls, val, label):
+                if isinstance(val, enum_cls):
+                    return val
+                s = str(val)
+                # case-insensitive name match
+                for member in enum_cls:
+                    if member.name.lower() == s.lower():
+                        return member
+                # try by value
+                try:
+                    return enum_cls(val)
+                except Exception:
+                    names = ", ".join([m.name for m in enum_cls])
+                    values = ", ".join([str(m.value) for m in enum_cls])
+                    raise ValueError(f"Invalid {label}. Permitted names: {names}. Permitted values: {values}")
+
+            try:
+                sync_type = _to_enum(SyncType, sync_type, "sync_type")
+                source_type = _to_enum(DataSourceType, source_type, "source_type")
+            except ValueError as ex:
+                logger.error(str(ex))
+                return {
+                    "status": SyncStatus.FAILED,
+                    "message": str(ex),
+                    "started_at": start
+                }
             # Get data source class
             source_cls = self._datasource_map.get(source_type)
             if not source_cls:
@@ -76,15 +105,7 @@
             }
 
             # Execute sync
-
-            status = SyncStatus.SUCCESS if success else SyncStatus.FAILED
-
-            return {
-                "status": status,
-                "message": f"{sync_type} completed" if success else f"{sync_type} failed",
-                "started_at": start,
-                "finished_at": datetime.utcnow()
-            }
+            return strategy.sync(**kwargs)
 
         except Exception as ex:
             logger.exception("SyncManager.execute_sync failed")
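Illustrative usage sketch (not taken from the package): with the new _to_enum coercion above, execute_sync now accepts plain strings for sync_type and source_type and matches them case-insensitively against the enum member names, falling back to value lookup. The constructor call, the "full_load" member name and the config shapes below are assumptions for illustration only.

    from datasourcelib.core.sync_manager import SyncManager

    manager = SyncManager()  # assumed no-arg constructor
    result = manager.execute_sync(
        sync_type="full_load",            # assumed SyncType member name; matched case-insensitively
        source_type="dataverse",          # resolves to DataSourceType.Dataverse via the name match
        source_config={"dv_mode": "tds", "dv_tds_connection_string": "..."},  # placeholder config
        vector_db_config={},              # placeholder config
    )
    print(result["status"], result["message"])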
datasourcelib/datasources/dataverse_source.py
ADDED
@@ -0,0 +1,305 @@
+from typing import Any, Dict, List, Optional, Tuple
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+from datasourcelib.utils.aggregation import generate_grouped_summaries
+import pyodbc
+import time
+import pandas as pd
+
+# optional requests import (webapi mode)
+try:
+    import requests  # type: ignore
+except Exception:
+    requests = None  # lazy import
+
+logger = get_logger(__name__)
+
+class DataverseSource(DataSourceBase):
+
+    def __init__(self, config: Dict[str, Any]):
+        super().__init__(config)
+        self._conn = None
+        self._mode = (self.config.get("dv_mode") or "tds").lower()  # "tds" or "webapi"
+        self._access_token: Optional[str] = None
+        self._headers: Dict[str, str] = {}
+        self._max_retries = int(self.config.get("dv_max_retries", 3))
+
+    def validate_config(self) -> bool:
+        """
+        Validate required keys depending on selected dv_mode.
+        - tds: requires either 'tds_connection_string' OR ('dataverse_server' and 'dataverse_database')
+        - webapi: requires 'webapi_url','client_id','client_secret','tenant_id' (or 'resource')
+        """
+        try:
+            if self._mode == "webapi":
+                require_keys(self.config, ["dv_webapi_url", "dv_webapi_client_id", "dv_webapi_client_secret", "dv_webapi_tenant_id"])
+            else:
+                # TDS mode (ODBC)
+                if "dv_tds_connection_string" in self.config:
+                    return True
+                # otherwise require components
+                require_keys(self.config, ["dv_tds_server", "dv_tds_database"])
+                # if not using integrated auth require creds
+                if not bool(self.config.get("dv_tds_windows_auth", False)):
+                    require_keys(self.config, ["dv_tds_username", "dv_tds_password"])
+            return True
+        except Exception as ex:
+            logger.error("DataverseSource.validate_config failed: %s", ex)
+            return False
+
+    # -------------------------
+    # Connection helpers
+    # -------------------------
+    def _get_available_driver(self) -> str:
+        """Return first suitable ODBC driver for SQL/Dataverse TDS access."""
+        preferred_drivers = [
+            "ODBC Driver 18 for SQL Server",
+            "ODBC Driver 17 for SQL Server",
+            "SQL Server Native Client 11.0",
+            "SQL Server"
+        ]
+        try:
+            drivers = pyodbc.drivers()
+            logger.info("Available ODBC drivers: %s", drivers)
+
+            for d in preferred_drivers:
+                if d in drivers:
+                    logger.info("Using ODBC driver: %s", d)
+                    return d
+
+            # fallback to first available
+            if drivers:
+                logger.warning("No preferred driver found. Using: %s", drivers[0])
+                return drivers[0]
+            raise RuntimeError("No ODBC drivers available")
+        except Exception as ex:
+            logger.error("DataverseSource._get_available_driver failed: %s", ex)
+            raise
+
+    def _build_tds_conn_str(self) -> str:
+        """Build valid connection string with proper parameter names."""
+        if "dv_tds_connection_string" in self.config:
+            return self.config["dv_tds_connection_string"]
+
+        driver = self._get_available_driver()
+        # Fix: use correct config key names (dv_tds_server, not dv_tds_dataverse_server)
+        server = self.config.get("dv_tds_server", "").strip()
+        database = self.config.get("dv_tds_database", "").strip()
+
+        if not server:
+            raise ValueError("dv_tds_server are required")
+
+        logger.info("Building TDS connection (driver=%s, server=%s, database=%s)", driver, server, database)
+
+        # Use curly braces for driver name (handles spaces in driver names)
+        parts = [f"DRIVER={{{driver}}}"]
+        parts.append(f"Server={server}")
+        parts.append(f"Database={database}")
+        password = None
+        if bool(self.config.get("dv_tds_windows_auth", False)):
+            parts.append("Trusted_Connection=yes")
+            logger.info("Using Windows authentication")
+        else:
+            username = self.config.get("dv_tds_username", "").strip()
+            password = self.config.get("dv_tds_password", "").strip()
+
+            if not username or not password:
+                raise ValueError("dv_tds_username and dv_tds_password required when Windows auth disabled")
+
+            parts.append(f"UID={username}")
+            parts.append(f"PWD={password}")
+            parts.append("Authentication=ActiveDirectoryInteractive")
+        # Encryption settings
+        if not bool(self.config.get("dv_tds_is_onprem", False)):
+            parts.append("Encrypt=yes")
+            parts.append("TrustServerCertificate=no")
+        else:
+            parts.append("Encrypt=optional")
+            parts.append("TrustServerCertificate=yes")
+
+        conn_str = ";".join(parts)
+        logger.debug("Connection string: %s", conn_str.replace(password or "", "***") if password else conn_str)
+        return conn_str
+
+    def _obtain_webapi_token(self) -> Tuple[str, Dict[str, str]]:
+        """
+        Acquire OAuth2 token using client credentials flow.
+        Returns (access_token, headers)
+        Config expected keys: tenant_id, client_id, client_secret, optional resource
+        """
+        if requests is None:
+            raise RuntimeError("requests package required for Dataverse Web API mode")
+        tenant = self.config["dv_webapi_tenant_id"]
+        client_id = self.config["dv_webapi_client_id"]
+        client_secret = self.config["dv_webapi_client_secret"]
+        # resource or scope: prefer explicit resource, else fallback to webapi_url host
+        resource = self.config.get("dv_webapi_resource")
+        if not resource:
+            # infer resource from webapi_url e.g. https://<org>.crm.dynamics.com
+            webapi_url = self.config["dv_webapi_url"].rstrip("/")
+            resource = webapi_url.split("://")[-1]
+            resource = f"https://{resource}"  # as resource
+        token_url = f"https://login.microsoftonline.com/{tenant}/oauth2/v2.0/token"
+        data = {
+            "grant_type": "client_credentials",
+            "client_id": client_id,
+            "client_secret": client_secret,
+            "scope": f"{resource}/.default"
+        }
+        resp = requests.post(token_url, data=data, timeout=30)
+        resp.raise_for_status()
+        j = resp.json()
+        token = j.get("access_token")
+        if not token:
+            raise RuntimeError("Failed to obtain access token for Dataverse webapi")
+        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json", "OData-MaxVersion": "4.0", "OData-Version": "4.0"}
+        return token, headers
+
+    # -------------------------
+    # Public connection API
+    # -------------------------
+    def connect(self) -> bool:
+        try:
+            if self._mode == "webapi":
+                token, headers = self._obtain_webapi_token()
+                self._access_token = token
+                self._headers = headers
+                self._connected = True
+                logger.info("DataverseSource connected (webapi mode) to %s", self.config.get("dv_webapi_url"))
+                return True
+            # else TDS mode
+            conn_str = self._build_tds_conn_str()
+            self._conn = pyodbc.connect(conn_str, timeout=int(self.config.get("dv_tds_timeout", 30)))
+            self._connected = True
+            logger.info("DataverseSource connected (dv_tds mode) to %s/%s", self.config.get("dv_server"), self.config.get("dv_database"))
+            return True
+        except pyodbc.Error as ex:
+            logger.error("DataverseSource.connect failed - ODBC Error: %s", ex)
+            self._connected = False
+            return False
+        except requests.RequestException as ex:
+            logger.error("DataverseSource.connect failed - HTTP Error: %s", ex)
+            self._connected = False
+            return False
+        except Exception as ex:
+            logger.exception("DataverseSource.connect failed")
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        try:
+            if self._conn:
+                try:
+                    self._conn.close()
+                except Exception:
+                    pass
+            self._conn = None
+            self._access_token = None
+            self._headers = {}
+        finally:
+            self._connected = False
+            logger.info("DataverseSource disconnected")
+
+    # -------------------------
+    # Data fetch
+    # -------------------------
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Fetch rows from Dataverse.
+        - TDS mode: executes SQL query (config key 'tds_query' or provided 'query')
+        - WebAPI mode: calls Dataverse Web API path fragment (e.g. 'accounts?$select=name') or uses 'entity_set' + query params
+        Returns list[dict].
+        """
+        attempt = 0
+        while attempt < self._max_retries:
+            try:
+                if not getattr(self, "_connected", False):
+                    ok = self.connect()
+                    if not ok:
+                        raise RuntimeError("DataverseSource: cannot connect")
+
+                if self._mode == "webapi":
+                    if requests is None:
+                        raise RuntimeError("requests package required for webapi mode")
+                    webapi_url = self.config["dv_webapi_url"].rstrip("/")
+                    # if query provided, treat it as path fragment; else use entity_set from config
+                    path_fragment = query or self.config.get("dv_webapi_entity_set")
+                    if not path_fragment:
+                        raise ValueError("DataverseSource.fetch_data requires a webapi 'query' or 'entity_set' in config")
+                    url = f"{webapi_url}/api/data/v9.1/{path_fragment.lstrip('/')}"
+                    params = kwargs.get("params")
+                    resp = requests.get(url, headers=self._headers, params=params, timeout=60)
+                    resp.raise_for_status()
+                    j = resp.json()
+                    items: Any = []
+                    # Dataverse OData responses typically use 'value' for collections
+                    if isinstance(j, dict) and "value" in j:
+                        items = j["value"]
+                    # otherwise return the raw json wrapped in a list or as-is
+                    elif isinstance(j, list):
+                        items = j
+                    else:
+                        items = [j]
+
+                    df = pd.DataFrame(items)
+                    # filter columns if configured
+                    keep = self.config.get("dv_webapi_columns_to_keep")
+                    if isinstance(keep, list) and keep:
+                        cols_to_keep = [c for c in df.columns if c in keep]
+                    else:
+                        # exclude SharePoint metadata columns (start with '__' or prefixed with '@')
+                        cols_to_keep = [c for c in df.columns if not str(c).startswith("__") and not str(c).startswith("@")]
+                    df = df[cols_to_keep]
+
+                    results = df.to_dict("records")
+                    return results
+                # else TDS mode
+                sql = query or self.config.get("dv_tds_query") or self.config.get("dv_sql_query")
+                if not sql:
+                    raise ValueError("DataverseSource.fetch_data requires a SQL query (tds mode)")
+
+                cur = self._conn.cursor()
+                try:
+                    cur.execute(sql)
+                    cols = [c[0] for c in (cur.description or [])]
+                    rows = cur.fetchall()
+                    results: List[Dict[str, Any]] = []
+                    for r in rows:
+                        results.append({cols[i]: r[i] for i in range(len(cols))})
+
+                    df = pd.DataFrame(results)
+                    summaries = generate_grouped_summaries(
+                        df=df,
+                        aggregation_field=self.config.get("dv_tds_aggregation_field"),
+                        row_format=self.config.get("dv_tds_row_format"),
+                        constants={"title": ""},
+                        header_format=self.config.get("dv_tds_header_format"),
+                        sort_by=self.config.get("dv_tds_sort_by"),  # or a column/list if you want ordering
+                        validate=True  # ensures all placeholders exist
+                    )
+
+                    return summaries
+                finally:
+                    try:
+                        cur.close()
+                    except Exception:
+                        pass
+
+            except Exception as ex:
+                attempt += 1
+                logger.warning("DataverseSource.fetch_data attempt %d/%d failed: %s", attempt, self._max_retries, ex)
+                # transient retry for network/connection errors
+                if attempt >= self._max_retries:
+                    logger.exception("DataverseSource.fetch_data final failure")
+                    raise
+                # backoff
+                time.sleep(min(2 ** attempt, 10))
+                # try reconnect for next attempt
+                try:
+                    self.disconnect()
+                except Exception:
+                    pass
+
+        # unreachable; defensive
+        return []
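A hypothetical TDS-mode configuration for the new DataverseSource is sketched below; only the key names come from the code above, all values are placeholders. Web API mode instead sets dv_mode="webapi" and requires dv_webapi_url, dv_webapi_client_id, dv_webapi_client_secret and dv_webapi_tenant_id.

    from datasourcelib.datasources.dataverse_source import DataverseSource

    config = {
        "dv_mode": "tds",                                  # default mode; "webapi" switches to the OAuth2/Web API path
        "dv_tds_server": "org.crm.dynamics.com",           # placeholder
        "dv_tds_database": "orgdb",                        # placeholder
        "dv_tds_username": "user@example.com",             # required unless dv_tds_windows_auth=True
        "dv_tds_password": "secret",                       # placeholder
        "dv_tds_query": "SELECT accountid, name FROM account",  # placeholder SQL
        "dv_tds_aggregation_field": "name",                # forwarded to generate_grouped_summaries
        "dv_tds_row_format": "{i}. {accountid}",
        "dv_tds_header_format": "{group_value} has {count} record{plural}.",
    }

    source = DataverseSource(config)
    if source.validate_config() and source.connect():
        summaries = source.fetch_data()   # the TDS path returns grouped summaries, not raw rows
        source.disconnect()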
datasourcelib/datasources/sql_source_bkup.py
ADDED
@@ -0,0 +1,159 @@
+from typing import Any, Dict, List, Optional
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+import os
+import pyodbc
+
+
+logger = get_logger(__name__)
+
+class SQLDataSource(DataSourceBase):
+
+    def __init__(self, config: Dict[str, Any]):
+        super().__init__(config)
+        self._conn = None
+        self._is_sqlite = False
+
+    def validate_config(self) -> bool:
+        """
+        Validate config. If sql_windows_auth is True then sql_username/sql_password are optional.
+        Otherwise require sql_username and sql_password.
+        """
+        try:
+            # Always require server/database at minimum
+            require_keys(self.config, ["sql_server", "sql_database"])
+            # If not using Windows authentication, require credentials
+            if not bool(self.config.get("sql_windows_auth", False)):
+                require_keys(self.config, ["sql_username", "sql_password"])
+            return True
+        except Exception as ex:
+            logger.error("SQLDataSource.validate_config: %s", ex)
+            return False
+
+    def connect(self) -> bool:
+        try:
+            sql_server = self.config.get("sql_server", "")
+            sql_database = self.config.get("sql_database", "")
+            sql_is_onprem = self.config.get("sql_is_onprem", False)
+
+            # Determine auth mode: sql_windows_auth (Trusted Connection) overrides username/password
+            sql_windows_auth = bool(self.config.get("sql_windows_auth", False))
+
+            # Get available driver
+            sql_driver = self._get_available_driver()
+
+            # Build connection string
+            conn_params = [
+                f'DRIVER={sql_driver}',
+                f'SERVER={sql_server}',
+                f'DATABASE={sql_database}',
+            ]
+
+            if sql_windows_auth:
+                # Use integrated Windows authentication (Trusted Connection)
+                # This will use the current process credentials / kerberos ticket.
+                conn_params.append('Trusted_Connection=yes')
+                logger.info("SQLDataSource using Windows (integrated) authentication")
+            else:
+                sql_username = self.config.get("sql_username", "")
+                sql_password = self.config.get("sql_password", "")
+                conn_params.extend([f'UID={sql_username}', f'PWD={sql_password}'])
+
+            # Add encryption settings based on environment
+            if not sql_is_onprem:
+                conn_params.extend([
+                    'Encrypt=yes',
+                    'TrustServerCertificate=no'
+                ])
+            else:
+                conn_params.extend([
+                    'Encrypt=optional',
+                    'TrustServerCertificate=yes'
+                ])
+
+            conn_str = ';'.join(conn_params)
+
+            # Attempt connection with timeout
+            self._conn = pyodbc.connect(conn_str, timeout=30)
+            self._connected = True
+            logger.info("SQLDataSource connected to %s using driver %s (sql_windows_auth=%s)", sql_server, sql_driver, sql_windows_auth)
+            return True
+
+        except pyodbc.Error as ex:
+            logger.error("SQLDataSource.connect failed - ODBC Error: %s", ex)
+            self._connected = False
+            return False
+        except Exception as ex:
+            logger.error("SQLDataSource.connect failed - Unexpected Error: %s", ex)
+            self._connected = False
+            return False
+
+    def disconnect(self) -> None:
+        try:
+            if self._conn:
+                self._conn.close()
+        finally:
+            self._conn = None
+            self._connected = False
+            logger.info("SQLDataSource disconnected")
+
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        max_retries = 3
+        retry_count = 0
+
+        while retry_count < max_retries:
+            try:
+                if not self._connected:
+                    ok = self.connect()
+                    if not ok:
+                        raise RuntimeError("SQLDataSource: not connected and cannot connect")
+
+                query = self.config.get("sql_query")
+                if not query:
+                    raise ValueError("SQLDataSource.fetch_data requires a query")
+
+                cur = self._conn.cursor()
+                try:
+                    cur.execute(query)
+                    cols = [d[0] if hasattr(d, "__len__") else d[0] for d in (cur.description or [])]
+                    rows = cur.fetchall()
+                    results: List[Dict[str, Any]] = []
+                    for r in rows:
+                        results.append({cols[i]: r[i] for i in range(len(cols))})
+                    return results
+                finally:
+                    try:
+                        cur.close()
+                    except Exception:
+                        pass
+
+            except pyodbc.OperationalError as ex:
+                # Handle connection lost
+                retry_count += 1
+                logger.warning("Connection lost, attempt %d of %d: %s", retry_count, max_retries, ex)
+                self.disconnect()
+                if retry_count >= max_retries:
+                    raise
+            except Exception as ex:
+                logger.error("Query execution failed: %s", ex)
+                raise
+
+    def _get_available_driver(self) -> str:
+        """Get first available SQL Server driver from preferred list."""
+        preferred_drivers = [
+            'ODBC Driver 18 for SQL Server',
+            'ODBC Driver 17 for SQL Server',
+            'SQL Server Native Client 11.0',
+            'SQL Server'
+        ]
+
+        try:
+            available_drivers = pyodbc.drivers()
+            for driver in preferred_drivers:
+                if driver in available_drivers:
+                    return driver
+            raise RuntimeError(f"No suitable SQL Server driver found. Available drivers: {available_drivers}")
+        except Exception as ex:
+            logger.error("Failed to get SQL drivers: %s", ex)
+            raise
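For reference, a hypothetical configuration for this backed-up SQLDataSource (values are placeholders; the key names come from the code above). Note that fetch_data reads the statement from config["sql_query"] rather than from its query argument.

    config = {
        "sql_server": "myserver.database.windows.net",   # placeholder
        "sql_database": "mydb",                          # placeholder
        "sql_windows_auth": False,    # when False, sql_username/sql_password are required
        "sql_username": "reader",     # placeholder
        "sql_password": "secret",     # placeholder
        "sql_is_onprem": False,       # cloud default: Encrypt=yes, TrustServerCertificate=no
        "sql_query": "SELECT TOP 10 * FROM dbo.items",   # placeholder query read by fetch_data
    }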
datasourcelib/strategies/daily_load.py
CHANGED
@@ -1,22 +1,47 @@
 from datasourcelib.core.sync_base import SyncBase
 from datasourcelib.utils.logger import get_logger
-from datetime import datetime,
+from datetime import datetime, timezone
+from typing import Dict, Any, Optional
 
 logger = get_logger(__name__)
 
 class DailyLoadStrategy(SyncBase):
-    """Daily scheduled load (wraps incremental)."""
+    """Daily scheduled load strategy (wraps incremental sync)."""
 
     def validate(self) -> bool:
+        """Validate strategy preconditions."""
         return True
 
-    def sync(self, run_date: str = None, **kwargs) ->
+    def sync(self, run_date: Optional[str] = None, **kwargs) -> Dict[str, Any]:
+        """
+        Execute daily load for the given run_date (ISO date string).
+        If run_date is None, today's UTC date is used.
+
+        Returns a dict with status, message and ISO timestamps.
+        """
+        # Ensure run_date and started_at exist even if exceptions occur early
+        run_date = run_date
+        started_at = datetime.now(timezone.utc).isoformat()
         try:
-            run_date = run_date or datetime.
-            logger.info("Starting daily load for %s", run_date)
-
-            # TODO
-
-
+            run_date = run_date or datetime.now(timezone.utc).date().isoformat()
+            logger.info("Starting daily load for %s (requested run_date=%s)", started_at, run_date)
+
+            # TODO: call incremental sync / processing here, for example:
+            # result = self.incremental_sync(last_sync=..., **kwargs)
+
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "success",
+                "message": f"Daily load completed for {run_date}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
+        except Exception as ex:
             logger.exception("DailyLoadStrategy.sync failed")
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "failure",
+                "message": f"Exception: {ex}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
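A minimal sketch of the defaults the updated DailyLoadStrategy.sync now computes (standard library only; the example date is illustrative):

    from datetime import datetime, timezone

    run_date = datetime.now(timezone.utc).date().isoformat()   # e.g. "2024-01-15"
    started_at = datetime.now(timezone.utc).isoformat()        # timezone-aware ISO 8601 timestamp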
datasourcelib/strategies/full_load.py
CHANGED
@@ -1,38 +1,99 @@
+from typing import Dict, Any
+from datetime import datetime, timezone
+
 from datasourcelib.core.sync_base import SyncBase
 from datasourcelib.utils.logger import get_logger
 from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
+
 logger = get_logger(__name__)
 
+
 class FullLoadStrategy(SyncBase):
     """Full load: replace or reload entire source into vector DB."""
 
     def validate(self) -> bool:
-        # Minimal validation: required keys exist
-
-
+        # Minimal validation: required keys exist on datasource
+        try:
+            return bool(self.data_source and self.data_source.validate_config())
+        except Exception:
+            logger.exception("FullLoadStrategy.validate failed")
+            return False
 
-    def sync(self, **kwargs) ->
+    def sync(self, **kwargs) -> Dict[str, Any]:
+        """
+        Execute full load: read data from data_source and index into vector DB (Azure Search).
+        Returns a dict with status, message and ISO timestamps.
+        """
+        started_at = datetime.now(timezone.utc).isoformat()
         try:
-            logger.info("Running full data load")
+            logger.info("Running full data load (started_at=%s)", started_at)
+
+            # Fetch data from configured data source
             data = self.data_source.fetch_data(**kwargs)
-
-
-
-
-
-            #
-            if isinstance(data, list)
-
-
-
-
-
-
-
-
-
-
-
+
+            # Log kwargs for debugging at debug level
+            if kwargs:
+                logger.debug("FullLoadStrategy.sync kwargs: %s", kwargs)
+
+            # If no data returned, finish gracefully
+            total_records = len(data) if isinstance(data, (list, tuple)) else (1 if data is not None else 0)
+            if total_records == 0:
+                finished_at = datetime.now(timezone.utc).isoformat()
+                msg = "No records returned from data source"
+                logger.info(msg)
+                return {
+                    "status": "success",
+                    "message": msg,
+                    "started_at": started_at,
+                    "finished_at": finished_at,
+                    "loaded_records": 0
+                }
+
+            # Use AzureSearchIndexer to create index and upload documents if requested
+            indexer = AzureSearchIndexer(self.vector_db_config or {})
+            if not indexer.validate_config():
+                finished_at = datetime.now(timezone.utc).isoformat()
+                msg = "Vector DB config invalid for Azure Search indexer"
+                logger.error(msg)
+                return {
+                    "status": "failure",
+                    "message": msg,
+                    "started_at": started_at,
+                    "finished_at": finished_at,
+                    "loaded_records": 0
+                }
+
+            ok = indexer.index(data)
+            if not ok:
+                finished_at = datetime.now(timezone.utc).isoformat()
+                msg = "Indexing data to Azure Search failed"
+                logger.error(msg)
+                return {
+                    "status": "failure",
+                    "message": msg,
+                    "started_at": started_at,
+                    "finished_at": finished_at,
+                    "loaded_records": total_records
+                }
+
+            finished_at = datetime.now(timezone.utc).isoformat()
+            msg = f"Full load completed. Loaded {total_records} records."
+            logger.info("Full data load finished successfully (%s)", msg)
+            return {
+                "status": "success",
+                "message": msg,
+                "started_at": started_at,
+                "finished_at": finished_at,
+                "loaded_records": total_records
+            }
+
+        except Exception as ex:
             logger.exception("FullLoadStrategy.sync failed")
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "failure",
+                "message": f"Exception: {ex}",
+                "started_at": started_at,
+                "finished_at": finished_at,
+                "loaded_records": 0
+            }
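All of the reworked strategies in this release now return a result dictionary rather than a bare boolean. A sketch of its shape (field values are illustrative; loaded_records is specific to FullLoadStrategy):

    result = {
        "status": "success",                        # or "failure"
        "message": "Full load completed. Loaded 42 records.",
        "started_at": "2024-01-15T10:00:00+00:00",  # ISO 8601, UTC
        "finished_at": "2024-01-15T10:02:31+00:00",
        "loaded_records": 42,
    }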
datasourcelib/strategies/incremental_load.py
CHANGED
@@ -1,7 +1,7 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from datasourcelib.core.sync_base import SyncBase
 from datasourcelib.utils.logger import get_logger
-
+from typing import Dict, Any
 logger = get_logger(__name__)
 
 class IncrementalLoadStrategy(SyncBase):
@@ -14,14 +14,27 @@ class IncrementalLoadStrategy(SyncBase):
             return False
         return True
 
-    def sync(self, last_sync: str = None, **kwargs) ->
+    def sync(self, last_sync: str = None, **kwargs) -> Dict[str, Any]:
         try:
+            started_at = datetime.now(timezone.utc).isoformat()
             last = last_sync or self.source_config.get("last_sync")
             logger.info("Running incremental load since %s", last)
             # TODO: fetch delta rows since 'last' and upsert to vector DB
             # After successful run store new last_sync timestamp
             logger.info("Incremental load completed")
-
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "success",
+                "message": f"Incremental load completed since {last}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
+        except Exception as ex:
             logger.exception("IncrementalLoadStrategy.sync failed")
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "failure",
+                "message": f"Exception: {ex}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
datasourcelib/strategies/ondemand_load.py
CHANGED
@@ -1,6 +1,7 @@
 from datasourcelib.core.sync_base import SyncBase
 from datasourcelib.utils.logger import get_logger
-
+from typing import Dict, Any
+from datetime import datetime, timezone
 logger = get_logger(__name__)
 
 class OnDemandLoadStrategy(SyncBase):
@@ -9,11 +10,24 @@ class OnDemandLoadStrategy(SyncBase):
     def validate(self) -> bool:
         return True
 
-    def sync(self, **kwargs) ->
+    def sync(self, **kwargs) -> Dict[str, Any]:
         try:
+            started_at = datetime.now(timezone.utc).isoformat()
             logger.info("On-demand sync invoked with params: %s", kwargs)
             # Use kwargs to drive partial loads, filters, ids etc.
-
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "success",
+                "message": f"Ondemand load completed.",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
+        except Exception as ex:
             logger.exception("OnDemandLoadStrategy.sync failed")
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "failure",
+                "message": f"Exception: {ex}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
datasourcelib/strategies/timerange_load.py
CHANGED
@@ -1,6 +1,7 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from datasourcelib.core.sync_base import SyncBase
 from datasourcelib.utils.logger import get_logger
+from typing import Dict, Any
 
 logger = get_logger(__name__)
 
@@ -11,14 +12,27 @@ class TimeRangeLoadStrategy(SyncBase):
         # rely on params at runtime; minimal validation OK
         return True
 
-    def sync(self, start: str = None, end: str = None, **kwargs) ->
+    def sync(self, start: str = None, end: str = None, **kwargs) -> Dict[str, Any]:
         try:
+            started_at = datetime.now(timezone.utc).isoformat()
             if not start or not end:
                 logger.error("TimeRangeLoadStrategy requires 'start' and 'end'")
                 return False
             logger.info("Time range load between %s and %s", start, end)
             # TODO: query source for timeframe and upsert
-
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "success",
+                "message": f"TimeRange load completed between {start} and {end}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
+        except Exception as ex:
             logger.exception("TimeRangeLoadStrategy.sync failed")
-
+            finished_at = datetime.now(timezone.utc).isoformat()
+            return {
+                "status": "failure",
+                "message": f"Exception: {ex}",
+                "started_at": started_at,
+                "finished_at": finished_at
+            }
datasourcelib/utils/aggregation.py
ADDED
@@ -0,0 +1,152 @@
+
+import pandas as pd
+from string import Formatter
+from typing import Iterable, Any, Dict, List, Optional, Union
+
+def _placeholders(fmt: str) -> List[str]:
+    """
+    Extract top-level placeholder names from a format string.
+    e.g., 'Number {i} is {fname}' -> ['i', 'fname'] """
+    return [field_name for _, field_name, _, _ in Formatter().parse(fmt) if field_name]
+
+def _safe_str(x) -> str:
+    return "" if pd.isna(x) else str(x).strip()
+
+def generate_grouped_summaries(
+    df: pd.DataFrame,
+    aggregation_field: str,
+    row_format: str,
+    *,
+    header_format: str = "{group_value} has {count} record{plural}.",
+    constants: Optional[Dict[str, Union[str, int, float]]] = None,
+    drop_empty_groups: bool = True,
+    sort_by: Optional[Union[str, Iterable[str]]] = None,
+    validate: bool = True
+) -> List[Dict[str, Any]]:
+    """
+    Build grouped summaries strictly when `aggregation_field` exists in `df` and is non-empty.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Source dataset.
+    aggregation_field : str
+        Column name to group by. Must exist in `df`.
+    row_format : str
+        Format string applied per row within a group.
+        You may use placeholders for any df columns, plus:
+        - {i}: 1-based sequence number within group
+        - constants you provide (e.g., {title_prefix})
+    header_format : str, optional
+        Format string for group headers. Available placeholders:
+        - {group_value}: the group key
+        - {count}: number of rows in the group
+        - {plural}: '' when count==1 else 's'
+        Default: "{group_value} has {count} record{plural}."
+    constants : dict, optional
+        Additional fixed values to be merged into each row's format context.
+        Example: {"title_prefix": "Mr"}
+    drop_empty_groups : bool, optional
+        If True, rows with blank/empty group values are discarded before grouping.
+    sort_by : str | Iterable[str] | None, optional
+        If provided, sorts rows within each group by these columns before formatting.
+    validate : bool, optional
+        If True, checks that all placeholders used in `row_format` and `header_format`
+        are available (in df columns or computed context). Raises ValueError if missing.
+
+    Returns
+    -------
+    List[str]
+        One formatted string per group (header + row lines joined with spaces).
+
+    Raises
+    ------
+    ValueError
+        - If `aggregation_field` is missing or empty
+        - If no non-empty values exist for `aggregation_field` (with drop_empty_groups=True)
+        - If required placeholders are missing when `validate=True`
+    KeyError
+        - If columns referenced in `sort_by` are missing
+    """
+    # Basic checks
+    if df.empty:
+        return []
+
+    agg_field = (aggregation_field or "").strip()
+    if not agg_field:
+        return df.to_dict("records")
+    if agg_field not in df.columns:
+        raise ValueError(f"aggregation_field '{agg_field}' not found in DataFrame columns: {list(df.columns)}")
+
+    # Prepare working frame
+    working = df.copy()
+    working[agg_field] = working[agg_field].astype(str).str.strip()
+
+    if drop_empty_groups:
+        working = working[working[agg_field].astype(bool)]
+
+    if working.empty:
+        raise ValueError(f"No rows with non-empty values found for aggregation_field '{agg_field}'.")
+
+    # Optional sort within groups
+    if sort_by is not None:
+        sort_cols = [sort_by] if isinstance(sort_by, str) else list(sort_by)
+        missing_sort = [c for c in sort_cols if c not in working.columns]
+        if missing_sort:
+            raise KeyError(f"sort_by columns not found in DataFrame: {missing_sort}")
+        working = working.sort_values(sort_cols, kind="stable")
+
+    # Validation of placeholders (if requested)
+    if validate:
+        df_cols = set(working.columns)
+        row_keys = set(_placeholders(row_format))
+        header_keys = set(_placeholders(header_format))
+        # Context keys provided by the function
+        provided_keys = {"i", "group_value", "count", "plural"}
+        constant_keys = set((constants or {}).keys())
+
+        missing_row = [k for k in row_keys if k not in df_cols and k not in constant_keys and k not in provided_keys]
+        missing_header = [k for k in header_keys if k not in provided_keys and k not in constant_keys and k not in df_cols]
+        if missing_row:
+            raise ValueError(
+                f"row_format references missing keys: {missing_row}. "
+                f"Ensure these are either df columns or in `constants`."
+            )
+        if missing_header:
+            raise ValueError(
+                f"header_format references missing keys: {missing_header}. "
+                f"Use only {{group_value}}, {{count}}, {{plural}} or provide constants."
+            )
+
+    # Build summaries per group
+    summaries = []
+    for group_value, group_df in working.groupby(agg_field, sort=True):
+        group_df = group_df.reset_index(drop=True)
+        count = len(group_df)
+        plural = "" if count == 1 else "s"
+
+        header_ctx = {
+            "group_value": _safe_str(group_value),
+            "count": count,
+            "plural": plural,
+            **(constants or {}),
+        }
+        header = header_format.format(**header_ctx)
+
+        lines = []
+        for i, row in enumerate(group_df.to_dict(orient="records"), start=1):
+            # Row context = df row + sequence + constants (constants override df if same key)
+            row_ctx = {k: _safe_str(v) for k, v in row.items()}
+            row_ctx.update({"i": i})
+            if constants:
+                # Constants override row values with same keys
+                row_ctx.update(constants)
+
+            lines.append(row_format.format(**row_ctx))
+
+        content = header + " " + " ".join(lines)
+        summaries.append(
+            {"content" : content, "id": group_value}
+        )
+
+    return summaries
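A small, self-contained example of the new helper (the sample DataFrame is made up; the default header_format is used):

    import pandas as pd
    from datasourcelib.utils.aggregation import generate_grouped_summaries

    df = pd.DataFrame([
        {"team": "Platform", "task": "Upgrade driver"},
        {"team": "Platform", "task": "Rotate secrets"},
        {"team": "Data", "task": "Backfill index"},
    ])

    summaries = generate_grouped_summaries(
        df=df,
        aggregation_field="team",
        row_format="{i}. {task}",
    )
    # [{'content': 'Data has 1 record. 1. Backfill index', 'id': 'Data'},
    #  {'content': 'Platform has 2 records. 1. Upgrade driver 2. Rotate secrets', 'id': 'Platform'}]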
{datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/RECORD
CHANGED
@@ -1,33 +1,36 @@
 datasourcelib/__init__.py,sha256=I7JTSZ1J6ULg_TfdMEgFcd1regkCHuyKdZT4DcPtoyQ,78
 datasourcelib/core/__init__.py,sha256=nsXojDd97T7eMqqtCsZr1qSYLBitvKydSZRb9Dg7hqU,462
-datasourcelib/core/sync_base.py,sha256=
-datasourcelib/core/sync_manager.py,sha256=
+datasourcelib/core/sync_base.py,sha256=fKbsJYtPIV0ow7sGH7O7GmAEeeSefvD16LBOz0dP4TU,726
+datasourcelib/core/sync_manager.py,sha256=pfnvWv4AwmlJJUIsfxNNxYDBOsa7juTIxgFJIEZ5bIM,4842
 datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
 datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
 datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
 datasourcelib/datasources/azure_devops_source.py,sha256=3hyZIrUdgwZEQNjb2iZGDMJcAw3Z6r7oV0hWAq_zMsg,8005
 datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
 datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
-datasourcelib/datasources/datasource_types.py,sha256=
+datasourcelib/datasources/datasource_types.py,sha256=jpm4f9n1l7X9aBD58Pbr9evXiCHHEhRCLojGwchUD7A,205
+datasourcelib/datasources/dataverse_source.py,sha256=PTIWArl_rRMap5QfH8ST5kCewE0Ax1xPZ1vgSxeujpU,14080
 datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
 datasourcelib/datasources/sharepoint_source.py,sha256=t3rly2mVEI2qEDuUVqstck5ktkZW0BnF16Bke_NjPLI,23126
 datasourcelib/datasources/sql_source.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
+datasourcelib/datasources/sql_source_bkup.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
 datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
 datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
 datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
-datasourcelib/strategies/daily_load.py,sha256=
-datasourcelib/strategies/full_load.py,sha256=
-datasourcelib/strategies/incremental_load.py,sha256=
-datasourcelib/strategies/ondemand_load.py,sha256=
-datasourcelib/strategies/timerange_load.py,sha256=
+datasourcelib/strategies/daily_load.py,sha256=A9BnPqPfbPO8UeBy-jtS53eORK7QWWqLOWHrtyFLbl4,1909
+datasourcelib/strategies/full_load.py,sha256=4BS_g4loR28OVqSDwXBCH2jCKbJLZxx6354KCOi_Qjk,4020
+datasourcelib/strategies/incremental_load.py,sha256=CY1tAyXwjZLoq5zMLwB5i5qmT_L8JBaiBxDy9hx8QkQ,1822
+datasourcelib/strategies/ondemand_load.py,sha256=MgenKJbJePLeErdEkXKsz1h7RuR8yT0RV_X523G7UUs,1304
+datasourcelib/strategies/timerange_load.py,sha256=W_sSZg059Lw2o9tmdGKM9D5-z1pph7AN1ftalXhuyjo,1557
 datasourcelib/utils/__init__.py,sha256=9pSIpaK-kdmNuDzwl0Z7QU-_lV3cZE-iwOEPh3RBBTs,298
+datasourcelib/utils/aggregation.py,sha256=5aOBcxay4eTyY-S4BRafNgSi37AY-JXERzcCv055E8w,6060
 datasourcelib/utils/byte_reader.py,sha256=GaoPXwJa2YTWG1Kim0K6JG20eVSaWkZJd1o9bswxHmc,9082
 datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4fYQww,369
 datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
 datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
 datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
-datasourcelib-0.1.
-datasourcelib-0.1.
-datasourcelib-0.1.
-datasourcelib-0.1.
-datasourcelib-0.1.
+datasourcelib-0.1.7.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+datasourcelib-0.1.7.dist-info/METADATA,sha256=Jo1RgpvptXpS-FxA6g9-7rVkknZDfzUrOpMQVFxG-9Y,1199
+datasourcelib-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datasourcelib-0.1.7.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+datasourcelib-0.1.7.dist-info/RECORD,,

{datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/WHEEL
File without changes
{datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/licenses/LICENSE
File without changes
{datasourcelib-0.1.5.dist-info → datasourcelib-0.1.7.dist-info}/top_level.txt
File without changes