datasourcelib 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datasourcelib/datasources/azure_devops_source copy.py (added file)
@@ -0,0 +1,126 @@
+ from typing import Any, Dict, List, Optional
+ from datasourcelib.datasources.datasource_base import DataSourceBase
+ from datasourcelib.utils.logger import get_logger
+ from datasourcelib.utils.validators import require_keys
+ import base64
+ import json
+ from bs4 import BeautifulSoup
+
+ logger = get_logger(__name__)
+
+ try:
+     import requests # type: ignore
+ except Exception:
+     requests = None # lazy import handled at runtime
+
+ class AzureDevOpsSource(DataSourceBase):
+
+     def validate_config(self) -> bool:
+         try:
+             require_keys(self.config, ["ado_organization", "ado_personal_access_token","ado_project","ado_query_id"])
+             return True
+         except Exception as ex:
+             logger.error("AzureDevOpsSource.validate_config: %s", ex)
+             return False
+
+     def connect(self) -> bool:
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         # No persistent connection; store auth header
+         pat = self.config.get("ado_personal_access_token")
+         token = pat
+         token_b64 = base64.b64encode(token.encode("utf-8")).decode("utf-8")
+         self._headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+         self._connected = True
+         logger.info("AzureDevOpsSource ready (no persistent connection required)")
+         return True
+
+     def disconnect(self) -> None:
+         self._headers = {}
+         self._connected = False
+         logger.info("AzureDevOpsSource cleared")
+
+     def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         if not getattr(self, "_connected", False):
+             self.connect()
+
+         org = self.config.get("ado_organization")
+         project = self.config.get("ado_project")
+         query_id = self.config.get("ado_query_id")
+         api_version = self.config.get("api_version", "7.1")
+         #path = self.config.get("query_path", query or "")
+         if not query_id:
+             raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")
+
+         base = f"https://dev.azure.com/{org}/"
+         if project:
+             base = f"{base}{project}/"
+         url = f"{base}_apis/wit/wiql/{query_id}"
+         params = {"api-version": api_version}
+         method = self.config.get("method", "GET").upper()
+         query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+         query_response.raise_for_status()
+         #data = resp.json()
+         # Check if the request was successful
+         if query_response.status_code == 200:
+             work_items = query_response.json()['workItems']
+             work_item_details = []
+
+             # Loop through each work item ID to get detailed information
+             for item in work_items:
+                 work_item_id = item['id']
+                 work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
+                 work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
+
+                 if work_item_response.status_code == 200:
+                     logger.info(f"Current Item: {work_item_id}")
+                     text = work_item_response.json()['fields']['System.Description']
+                     c_desc=BeautifulSoup(text, "html.parser").get_text()
+                     c_changedate = work_item_response.json()['fields']['System.ChangedDate']
+                     c_title = work_item_response.json()['fields']['System.Title']
+                     c_status = work_item_response.json()['fields']['System.State']
+                     c_type = work_item_response.json()['fields']['System.WorkItemType']
+                     c_created = work_item_response.json()['fields']['System.CreatedDate']
+
+                     default_value = "-VALUE NOT ASSIGNED-"
+                     c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
+                     logger.info(c_assigned)
+                     c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
+                     c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
+                     c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
+                     c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
+
+                     #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                     fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                     # Ensure work_item_details is a list and append a dict for this work item
+
+                     work_item_details.append({
+                         "id": work_item_id,
+                         "type": c_type,
+                         "title": c_title,
+                         "status": c_status,
+                         "assigned_to": c_assigned,
+                         "created": c_created,
+                         "changed_date": c_changedate,
+                         "tags": c_tags,
+                         "release_type": c_rtype,
+                         "target_date": c_rdate,
+                         "project": c_project,
+                         "description": c_desc,
+                         "full": fullfeature
+                     })
+                 else:
+                     logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
+
+             #work_item_desc = []
+             #for desc in work_item_details:
+             # work_item_desc.append(desc['fields']['System.Description'])
+
+
+             return work_item_details #[{"response": json.dumps(work_item_details)}]
+         else:
+             raise RuntimeError(f"Error: {query_response.status_code}")
+         # Caller decides how to interpret the payload; default: return raw json in a single-item list
+
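
A note on the auth header in connect() above: the code base64-encodes the PAT into token_b64 but never uses it, and instead sends the raw PAT with the Bearer scheme. The scheme Azure DevOps documents for PATs is Basic, with an empty username and ":<pat>" base64-encoded. A minimal sketch of that documented form (the helper name build_pat_headers is ours):

    import base64

    def build_pat_headers(pat: str) -> dict:
        # Basic auth for Azure DevOps PATs: empty username, then ":<pat>" base64-encoded
        token_b64 = base64.b64encode(f":{pat}".encode("utf-8")).decode("utf-8")
        return {"Authorization": f"Basic {token_b64}", "Content-Type": "application/json"}

Bearer is normally reserved for Microsoft Entra OAuth tokens, so if the code above works in a given setup, it is worth confirming which credential is actually being accepted.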

datasourcelib/datasources/azure_devops_source.py
@@ -50,77 +50,135 @@ class AzureDevOpsSource(DataSourceBase):
          project = self.config.get("ado_project")
          query_id = self.config.get("ado_query_id")
          api_version = self.config.get("api_version", "7.1")
-         #path = self.config.get("query_path", query or "")
          if not query_id:
              raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")

          base = f"https://dev.azure.com/{org}/"
          if project:
              base = f"{base}{project}/"
-         url = f"{base}_apis/wit/wiql/{query_id}"
+         # WIQL query by id (returns list of work item refs)
+         wiql_url = f"{base}_apis/wit/wiql/{query_id}"
          params = {"api-version": api_version}
          method = self.config.get("method", "GET").upper()
-         query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+         query_response = requests.request(method, wiql_url, headers=getattr(self, "_headers", {}), params=params)
          query_response.raise_for_status()
-         #data = resp.json()
-         # Check if the request was successful
-         if query_response.status_code == 200:
-             work_items = query_response.json()['workItems']
-             work_item_details = []
-
-             # Loop through each work item ID to get detailed information
-             for item in work_items:
-                 work_item_id = item['id']
-                 work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
-                 work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
-
-                 if work_item_response.status_code == 200:
-                     logger.info(f"Current Item: {work_item_id}")
-                     text = work_item_response.json()['fields']['System.Description']
-                     c_desc=BeautifulSoup(text, "html.parser").get_text()
-                     c_changedate = work_item_response.json()['fields']['System.ChangedDate']
-                     c_title = work_item_response.json()['fields']['System.Title']
-                     c_status = work_item_response.json()['fields']['System.State']
-                     c_type = work_item_response.json()['fields']['System.WorkItemType']
-                     c_created = work_item_response.json()['fields']['System.CreatedDate']
-
-                     default_value = "-VALUE NOT ASSIGNED-"
-                     c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
-                     logger.info(c_assigned)
-                     c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
-                     c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
-                     c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
-                     c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
-
-                     #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                     fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                     # Ensure work_item_details is a list and append a dict for this work item
-
-                     work_item_details.append({
-                         "id": work_item_id,
-                         "type": c_type,
-                         "title": c_title,
-                         "status": c_status,
-                         "assigned_to": c_assigned,
-                         "created": c_created,
-                         "changed_date": c_changedate,
-                         "tags": c_tags,
-                         "release_type": c_rtype,
-                         "target_date": c_rdate,
-                         "project": c_project,
-                         "description": c_desc,
-                         "full": fullfeature
-                     })
-                 else:
-                     logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
-
-             #work_item_desc = []
-             #for desc in work_item_details:
-             # work_item_desc.append(desc['fields']['System.Description'])
-
-
-             return work_item_details #[{"response": json.dumps(work_item_details)}]
-         else:
+
+         if query_response.status_code != 200:
              raise RuntimeError(f"Error: {query_response.status_code}")
-         # Caller decides how to interpret the payload; default: return raw json in a single-item list
-
+
+         work_items_refs = query_response.json().get('workItems', []) or []
+         if not work_items_refs:
+             return []
+
+         # collect ids and fetch details in batch to get all fields for all work item types
+         ids = [str(item.get('id')) for item in work_items_refs if item.get('id')]
+         if not ids:
+             return []
+
+         details_url = f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems"
+         # expand=all to include fields, relations, and attachments
+         params = {
+             "ids": ",".join(ids),
+             "api-version": api_version,
+             "$expand": "all"
+         }
+         details_resp = requests.get(details_url, headers=getattr(self, "_headers", {}), params=params)
+         details_resp.raise_for_status()
+         items = details_resp.json().get("value", [])
+
+         work_item_details: List[Dict[str, Any]] = []
+         for item in items:
+             item_id = item.get("id")
+             fields = item.get("fields", {}) or {}
+
+             # Normalize field keys to safe snake_case-like keys
+             norm_fields: Dict[str, Any] = {}
+             for k, v in fields.items():
+                 nk = k.replace(".", "_")
+                 nk = nk.lower()
+                 norm_fields[nk] = v
+
+             # Helper to safely extract nested displayName for assigned to
+             assigned = norm_fields.get("system_assignedto")
+             if isinstance(assigned, dict):
+                 assigned_to = assigned.get("displayName") or assigned.get("uniqueName") or str(assigned)
+             else:
+                 assigned_to = assigned
+
+             # find a description-like field (some types use different field names)
+             desc = ""
+             for fk in ["system_description", "microsoft_vsts_createdby", "html_description"]:
+                 if fk in norm_fields:
+                     desc = norm_fields.get(fk) or ""
+                     break
+             if not desc:
+                 # fallback: first field key that contains 'description'
+                 for kf, vf in norm_fields.items():
+                     if "description" in kf and vf:
+                         desc = vf
+                         break
+
+             # clean HTML description to text
+             try:
+                 c_desc = BeautifulSoup(desc or "", "html.parser").get_text()
+             except Exception:
+                 c_desc = desc or ""
+
+             # Build common convenience values (use available fields)
+             wi_type = norm_fields.get("system_workitemtype") or norm_fields.get("system_witype") or ""
+             title = norm_fields.get("system_title") or ""
+             status = norm_fields.get("system_state") or ""
+             created = norm_fields.get("system_createddate") or norm_fields.get("system_created") or ""
+             changed = norm_fields.get("system_changeddate") or norm_fields.get("system_changed") or ""
+             tags = norm_fields.get("system_tags", "")
+             project_name = norm_fields.get("custom.projectname") or norm_fields.get("system_teamproject") or ""
+
+             rtype = norm_fields.get("custom.releasetype") or norm_fields.get("custom_releasetype") or ""
+             target_date = norm_fields.get("microsoft_vsts_scheduling_targetdate") or norm_fields.get("microsoft.vsts.scheduling.targetdate") or ""
+
+             # Construct a 'full' description string using available pieces
+             parts = []
+             if wi_type:
+                 parts.append(f"{wi_type} ID {item_id}")
+             else:
+                 parts.append(f"WorkItem {item_id}")
+             if created:
+                 parts.append(f"was created on {created}")
+             if title:
+                 parts.append(f"and has Title '{title}'")
+             if status:
+                 parts.append(f"is currently in {status} state")
+             if assigned_to:
+                 parts.append(f"is assigned to {assigned_to}")
+             if project_name:
+                 parts.append(f"for Project '{project_name}'")
+             if rtype:
+                 parts.append(f"release type '{rtype}'")
+             if target_date:
+                 parts.append(f"with target date '{target_date}'")
+             if tags:
+                 parts.append(f"Tags: {tags}")
+             if c_desc:
+                 parts.append(f"Description: [{c_desc}]")
+             fullfeature = ". ".join(parts)
+
+             # include all normalized fields in the returned object for completeness
+             entry = {
+                 "id": item_id,
+                 "type": wi_type,
+                 "title": title,
+                 "status": status,
+                 "assigned_to": assigned_to,
+                 "created": created,
+                 "changed_date": changed,
+                 "tags": tags,
+                 "project": project_name,
+                 "release_type": rtype,
+                 "target_date": target_date,
+                 "description": c_desc,
+                 "full": fullfeature,
+                 "fields": norm_fields # full field set for this work item
+             }
+             work_item_details.append(entry)
+
+         return work_item_details
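
The rewrite above replaces the per-item GET loop with a single batched call to _apis/wit/workitems?ids=..., which is the main source of the speedup. Two caveats worth flagging: the Work Items list endpoint accepts at most 200 ids per request, so large WIQL results still need chunking; and the lookups norm_fields.get("custom.projectname") and norm_fields.get("microsoft.vsts.scheduling.targetdate") can never match, because the normalization loop has already replaced every dot with an underscore. A chunking sketch under those assumptions (the function name is illustrative, not part of the package):

    from typing import Any, Dict, List
    import requests

    def fetch_work_items_batched(base: str, ids: List[str], headers: Dict[str, str],
                                 api_version: str = "7.1", batch_size: int = 200) -> List[Dict[str, Any]]:
        # The documented cap on ids per call is 200; chunk and concatenate.
        items: List[Dict[str, Any]] = []
        for start in range(0, len(ids), batch_size):
            chunk = ids[start:start + batch_size]
            resp = requests.get(
                f"{base}_apis/wit/workitems",
                headers=headers,
                params={"ids": ",".join(chunk), "api-version": api_version, "$expand": "all"},
            )
            resp.raise_for_status()
            items.extend(resp.json().get("value", []))
        return items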

datasourcelib/datasources/sharepoint_source.py
@@ -7,6 +7,7 @@ import requests
  import pandas as pd
  import os
  from uuid import uuid4
+ from datetime import datetime, timedelta

  logger = get_logger(__name__)
  reader = ByteReader()

@@ -114,50 +115,86 @@ class SharePointSource(DataSourceBase):
          self._drive_id = drives[0].get("id")
          logger.info("Resolved SharePoint drive ID: %s", self._drive_id)

+     def _get_client_credentials(self) -> Tuple[str, str]:
+         """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+         # Fallback to sp_client_config
+         sp_client_config = self.config.get("sp_client_config", {})
+         client_id = sp_client_config.get("sp_client_id")
+         client_secret = sp_client_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             # Fallback to sp_master_config
+             sp_master_config = self.config.get("sp_master_config", {})
+             client_id = client_id or sp_master_config.get("sp_client_id")
+             client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+         return client_id, client_secret
+
+     def _get_download_credentials(self) -> Tuple[str, str]:
+         """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+         # Check sp_download_config first
+         sp_download_config = self.config.get("sp_client_config", {}).get("sp_download_config", {})
+         client_id = sp_download_config.get("sp_client_id")
+         client_secret = sp_download_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             # Fallback to sp_client_config
+             sp_client_config = self.config.get("sp_client_config", {})
+             client_id = client_id or sp_client_config.get("sp_client_id")
+             client_secret = client_secret or sp_client_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             # Fallback to sp_master_config
+             sp_master_config = self.config.get("sp_master_config", {})
+             client_id = client_id or sp_master_config.get("sp_client_id")
+             client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+         if not client_id or not client_secret:
+             raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+         return client_id, client_secret
+
+
      def connect(self) -> bool:
          try:
              # basic values
              self._site_url = self.config["sp_site_url"]
-             client_config = self.config["sp_client_config"]
              master_config = self.config["sp_master_config"]

              # get master token (Sites.Read.All)
              try:
-                 self._master_token = self._get_token(
-                     master_config["sp_client_id"], master_config["sp_client_secret"], master_config["sp_tenant_id"]
-                 )
+                 master_client_id = master_config["sp_client_id"]
+                 master_client_secret = master_config["sp_client_secret"]
+                 self._master_token = self._get_token(master_client_id, master_client_secret, master_config["sp_tenant_id"])
                  logger.info("$$$ - Obtained master access token for SharePoint - $$$")
              except Exception as ex:
                  logger.info("$$$ - Failed to obtain master token - $$$")

              # resolve site and drive ids
              try:
-                 self._resolve_site_and_drive(
-                     self.config['sp_site_display_name']
-                 )
+                 self._resolve_site_and_drive(self.config['sp_site_display_name'])
              except Exception:
                  logger.info("$$$ - Failed to resolve site/drive - $$$")
-
+
              # get client token (Site.Selected) for download operations
              try:
-                 # use master tenant id for tenant
-                 self._access_token = self._get_token(
-                     client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"]
-                 )
+                 client_id, client_secret = self._get_client_credentials()
+                 self._access_token = self._get_token(client_id, client_secret, master_config["sp_tenant_id"])
                  logger.info("$$$ - Obtained client access token for SharePoint downloads - $$$")
              except Exception:
                  logger.info("$$$ - Failed to obtain client access token - $$$")
-
+
              # get list client token (Site.Selected) for list operations
              try:
-                 # use master tenant id for tenant
-                 self._list_token = self._get_list_token(
-                     client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"],master_config["sp_domain_name"]
-                 )
+                 client_id, client_secret = self._get_client_credentials()
+                 self._list_token = self._get_list_token(client_id, client_secret, master_config["sp_tenant_id"], master_config["sp_domain_name"])
                  logger.info("$$$ - Obtained client list token for SharePoint list operations - $$$")
              except Exception:
                  logger.info("$$$ - Failed to obtain client list token - $$$")
-
+
              self._connected = True
              logger.info("SharePointSource connected for site: %s", self._site_url)
              return True
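
One inconsistency worth noting: _get_client_credentials carries the same docstring as _get_download_credentials ("sp_download_config, sp_client_config, sp_master_config") but only walks the last two levels. The config shape both helpers assume, as implied by the keys in the code (all values elided):

    sp_config = {
        "sp_site_url": "...",
        "sp_site_display_name": "...",
        "sp_master_config": {
            "sp_client_id": "...", "sp_client_secret": "...",
            "sp_tenant_id": "...", "sp_domain_name": "...",
        },
        "sp_client_config": {
            "sp_client_id": "...", "sp_client_secret": "...",
            # note: the download credentials are nested inside sp_client_config
            "sp_download_config": {"sp_client_id": "...", "sp_client_secret": "..."},
        },
    }

Also note that connect() swallows every token failure with a logger.info call and still sets self._connected = True, so a misconfigured source only surfaces later, when a download or list operation fails.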

@@ -324,10 +361,9 @@ class SharePointSource(DataSourceBase):
          results = []
          items = self._fetch_list_items_via_rest(relative_path)

-         if str(self.config.get("sp_client_config",{}).get("sp_download_config",{})["sp_client_id"]):
-             self._access_token = self._get_token(
-                 self.config.get("sp_client_config",{}).get("sp_download_config",{})["sp_client_id"], self.config.get("sp_client_config",{}).get("sp_download_config",{})["sp_client_secret"], self.config.get("sp_master_config",{})["sp_tenant_id"]
-             )
+         client_id, client_secret = self._get_download_credentials()
+
+         self._access_token = self._get_token(client_id, client_secret, self.config.get("sp_master_config",{})["sp_tenant_id"])
          #test running with hardcoded items
          if False:
              items = []
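
The removed guard was doubly broken: str(...) of a present value is always truthy, so the condition never gated anything, and the ["sp_client_id"] subscript raised KeyError whenever sp_download_config was absent. For comparison, a safe form of the intended check (a sketch; the function name is ours and 'config' stands for the source's config dict):

    def resolve_download_credentials(config: dict):
        # present-and-nonempty check, no KeyError when the section is missing
        download_cfg = config.get("sp_client_config", {}).get("sp_download_config", {})
        if download_cfg.get("sp_client_id") and download_cfg.get("sp_client_secret"):
            return download_cfg["sp_client_id"], download_cfg["sp_client_secret"]
        return None

The new _get_download_credentials path achieves the same effect with explicit fallbacks.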

@@ -350,12 +386,29 @@ class SharePointSource(DataSourceBase):
              })

          for item in items:
-             item_name = item.get("Title")
-             item_display_name = item.get("SiteDisplayName")
+             #the path after [Shared Documents/] in relative path
+             item_relative_path = item.get("RelativePath") or item.get("relativepath") or item.get("relativePath")
+             item_name = item.get("Title") or item.get("title")
+             item_display_name = item.get("SiteDisplayName") or item.get("sitedisplayname") or item.get("siteDisplayName")
+
+             # Check ModifiedDate filter
+             # "2024-01-15" → 10 chars || "20240115" → 8 chars
+             modified_date_str = item.get("ModifiedDate") or item.get("modifieddate") or item.get("modifiedDate")
+             if modified_date_str:
+                 try:
+                     modified_date = datetime.fromisoformat(modified_date_str.replace('Z', '+00:00'))
+                     if datetime.now(modified_date.tzinfo) - modified_date < timedelta(days=1):
+                         continue
+                 except Exception:
+                     pass
+
+             if not item_relative_path:
+                 logger.warning("Item missing RelativePath: %s", item)
+                 continue
+
              #get site id and drive id for this item
              self._resolve_site_and_drive(item_display_name)
-             #the path after [Shared Documents/] in relative path
-             item_relative_path = item.get("RelativePath")
+
              try:
                  content, filename = self._download_file_bytes(item_relative_path)
                  saved = self._save_file_if_requested(content, filename, save_path)
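
The ModifiedDate filter above is easy to misread: items modified less than a day ago are skipped (continue), not kept, and a parse failure silently keeps the item. A compact restatement of the predicate, assuming ISO-8601 timestamps with a trailing Z as in the code:

    from datetime import datetime, timedelta

    def modified_within(modified_date_str: str, days: int = 1) -> bool:
        # mirrors the parsing above: "Z" is rewritten to "+00:00" for fromisoformat()
        modified = datetime.fromisoformat(modified_date_str.replace("Z", "+00:00"))
        return datetime.now(modified.tzinfo) - modified < timedelta(days=days)

    # the loop skips an item when this returns True
    print(modified_within("2024-01-15T08:30:00Z"))  # False once the date is over a day old

If the intent was "only sync files changed in the last day", the inequality is inverted. The dangling comment about 10-character and 8-character date formats does not correspond to any code path.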

datasourcelib/datasources/sql_source.py
@@ -16,8 +16,16 @@ class SQLDataSource(DataSourceBase):
          self._is_sqlite = False

      def validate_config(self) -> bool:
+         """
+         Validate config. If sql_windows_auth is True then sql_username/sql_password are optional.
+         Otherwise require sql_username and sql_password.
+         """
          try:
-             require_keys(self.config, ["sql_server","sql_database","sql_username","sql_password","sql_is_onprem"])
+             # Always require server/database at minimum
+             require_keys(self.config, ["sql_server", "sql_database"])
+             # If not using Windows authentication, require credentials
+             if not bool(self.config.get("sql_windows_auth", False)):
+                 require_keys(self.config, ["sql_username", "sql_password"])
              return True
          except Exception as ex:
              logger.error("SQLDataSource.validate_config: %s", ex)
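
Example configs accepted by the new validate_config, using only keys that appear in the code (values illustrative):

    cfg_windows_auth = {
        "sql_server": "myserver.example.com",
        "sql_database": "mydb",
        "sql_windows_auth": True,   # username/password now optional
    }
    cfg_sql_auth = {
        "sql_server": "myserver.example.com",
        "sql_database": "mydb",
        "sql_username": "myuser",
        "sql_password": "...",      # required when sql_windows_auth is falsy
    }

Note that sql_is_onprem is no longer required even though connect() still reads it; it defaults to False, i.e. cloud-style encryption settings.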

@@ -27,22 +35,31 @@ class SQLDataSource(DataSourceBase):
          try:
              sql_server = self.config.get("sql_server", "")
              sql_database = self.config.get("sql_database", "")
-             sql_username = self.config.get("sql_username", "")
-             sql_password = self.config.get("sql_password", "")
              sql_is_onprem = self.config.get("sql_is_onprem", False)
-
+
+             # Determine auth mode: sql_windows_auth (Trusted Connection) overrides username/password
+             sql_windows_auth = bool(self.config.get("sql_windows_auth", False))
+
              # Get available driver
              sql_driver = self._get_available_driver()
-
-             # Build connection string with appropriate encryption settings
+
+             # Build connection string
              conn_params = [
                  f'DRIVER={sql_driver}',
                  f'SERVER={sql_server}',
                  f'DATABASE={sql_database}',
-                 f'UID={sql_username}',
-                 f'PWD={sql_password}'
              ]
-
+
+             if sql_windows_auth:
+                 # Use integrated Windows authentication (Trusted Connection)
+                 # This will use the current process credentials / kerberos ticket.
+                 conn_params.append('Trusted_Connection=yes')
+                 logger.info("SQLDataSource using Windows (integrated) authentication")
+             else:
+                 sql_username = self.config.get("sql_username", "")
+                 sql_password = self.config.get("sql_password", "")
+                 conn_params.extend([f'UID={sql_username}', f'PWD={sql_password}'])
+
              # Add encryption settings based on environment
              if not sql_is_onprem:
                  conn_params.extend([

@@ -56,13 +73,13 @@ class SQLDataSource(DataSourceBase):
              ])

              conn_str = ';'.join(conn_params)
-
+
              # Attempt connection with timeout
              self._conn = pyodbc.connect(conn_str, timeout=30)
              self._connected = True
-             logger.info("SQLDataSource connected to %s using driver %s", sql_server, sql_driver)
+             logger.info("SQLDataSource connected to %s using driver %s (sql_windows_auth=%s)", sql_server, sql_driver, sql_windows_auth)
              return True
-
+
          except pyodbc.Error as ex:
              logger.error("SQLDataSource.connect failed - ODBC Error: %s", ex)
              self._connected = False
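
For reference, the two branches assemble pyodbc connection strings along these lines (the driver name is illustrative; _get_available_driver picks whichever SQL Server ODBC driver is installed):

    windows_auth = ("DRIVER={ODBC Driver 18 for SQL Server};"
                    "SERVER=myserver.example.com;DATABASE=mydb;"
                    "Trusted_Connection=yes")

    sql_auth = ("DRIVER={ODBC Driver 18 for SQL Server};"
                "SERVER=myserver.example.com;DATABASE=mydb;"
                "UID=myuser;PWD=mypassword")

With Trusted_Connection=yes the driver authenticates as the account running the process (or its Kerberos ticket on Linux), which is why no UID/PWD pair is appended in that branch.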

datasourcelib/indexes/azure_search_index.py
@@ -110,7 +110,7 @@ class AzureSearchIndexer:
              logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
              raise

-     def _build_vector_search_config(self):
+     def _build_vector_search_config_old(self):
          AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
          vector_config = self.config.get("vector_config", {})
          dimensions = vector_config.get("dimensions", 1536)
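
The hunk below adds a rewritten _build_vector_search_config that attaches an Azure OpenAI vectorizer. A few observations: the algorithm value read from vector_config is never used (HNSW is hard-coded); passing a raw dict in VectorSearch(vectorizers=[...]) will typically fail the SDK's model serialization, which is what the nested try/except fallbacks paper over; and the "options"/"fieldMapping" block is not part of the documented vectorizer schema (content-to-vector mapping belongs to skillsets and index projections, not vectorizers). For comparison, a sketch using the typed models in azure-search-documents 11.6.x; constructor and parameter names have shifted across SDK versions, so treat this as one known-good shape rather than the only one:

    from azure.search.documents.indexes.models import (
        AzureOpenAIVectorizer,
        AzureOpenAIVectorizerParameters,
        HnswAlgorithmConfiguration,
        VectorSearch,
        VectorSearchProfile,
    )

    vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")],
        profiles=[VectorSearchProfile(
            name="vector-profile-1",
            algorithm_configuration_name="algorithms-config-1",
            vectorizer_name="azure-openai-vectorizer",
        )],
        vectorizers=[AzureOpenAIVectorizer(
            vectorizer_name="azure-openai-vectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url="https://<resource>.openai.azure.com",  # placeholder
                deployment_name="text-embedding-3-small",            # assumed deployment
                model_name="text-embedding-3-small",
                api_key="<api-key>",                                 # placeholder
            ),
        )],
    )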

@@ -121,6 +121,107 @@ class AzureSearchIndexer:
          )

          return vector_search, dimensions
+
+     def _build_vector_search_config(self):
+         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
+
+         vector_config = self.config.get("vector_config", {})
+         dimensions = vector_config.get("dimensions", 1536)
+         algorithm = vector_config.get("algorithm", "hnsw").lower()
+
+         # Build algorithm configuration (SDK model if available)
+         alg_cfg = HnswAlgorithmConfiguration(name="algorithms-config-1")
+
+         # Build vectorizer settings using Azure OpenAI config from vector_db_config
+         deployment = self.config.get("embedding_deployment")
+         endpoint = self.config.get("embedding_endpoint")
+         api_key = self.config.get("embedding_key")
+         # modelName required for API version 2025-09-01 — prefer explicit embedding_model, fall back to deployment
+         model_name = self.config.get("embedding_model") or deployment
+         content_field = self.config.get("content_field", "content")
+         vector_field = self.config.get("vector_field", "contentVector")
+
+         if not model_name:
+             raise RuntimeError("Vectorizer configuration requires 'embedding_model' or 'embedding_deployment' in vector_db_config")
+
+         # Define vectorizer with explicit name and required azureOpenAIParameters including modelName
+         vectorizer_name = "azure-openai-vectorizer"
+         vectorizer = {
+             "name": vectorizer_name,
+             "kind": "azureOpenAI",
+             "azureOpenAIParameters": {
+                 "resourceUri": endpoint.rstrip('/') if endpoint else None,
+                 # include both modelName (required) and deploymentId (if provided)
+                 "modelName": model_name,
+                 **({"deploymentId": deployment} if deployment else {}),
+                 "apiKey": api_key
+             },
+             "options": {
+                 "fieldMapping": [
+                     {
+                         "sourceContext": f"/document/{content_field}",
+                         "outputs": [
+                             {
+                                 "targetContext": f"/document/{vector_field}",
+                                 "targetDimensions": dimensions
+                             }
+                         ]
+                     }
+                 ]
+             }
+         }
+
+         profile_name = "vector-profile-1"
+         try:
+             # Create profile with vectorizer reference (SDK may expect vectorizer_name or vectorizer depending on version)
+             try:
+                 profile = VectorSearchProfile(
+                     name=profile_name,
+                     algorithm_configuration_name="algorithms-config-1",
+                     vectorizer_name=vectorizer_name
+                 )
+             except TypeError:
+                 # fallback if SDK constructor uses different parameter names
+                 profile = VectorSearchProfile(name=profile_name, algorithm_configuration_name="algorithms-config-1")
+                 try:
+                     setattr(profile, "vectorizer_name", vectorizer_name)
+                 except Exception:
+                     pass
+
+             try:
+                 # Construct full vector search config with both profile and vectorizer
+                 vector_search = VectorSearch(
+                     profiles=[profile],
+                     algorithms=[alg_cfg],
+                     vectorizers=[vectorizer]
+                 )
+             except Exception:
+                 # Fallback to dict if SDK constructor differs
+                 vector_search = {
+                     "profiles": [{
+                         "name": profile_name,
+                         "algorithmConfigurationName": "algorithms-config-1",
+                         "vectorizerName": vectorizer_name
+                     }],
+                     "algorithms": [{"name": "algorithms-config-1"}],
+                     "vectorizers": [vectorizer]
+                 }
+         except Exception:
+             # Full dict fallback
+             vector_search = {
+                 "profiles": [{
+                     "name": profile_name,
+                     "algorithmConfigurationName": "algorithms-config-1",
+                     "vectorizerName": vectorizer_name
+                 }],
+                 "algorithms": [{"name": "algorithms-config-1"}],
+                 "vectorizers": [vectorizer]
+             }
+
+         logger.info("Built vector_search config (dimensions=%s, model=%s, vectorizer=%s)",
+                     dimensions, model_name, vectorizer_name)
+         return vector_search, dimensions
+

      def _build_semantic_settings(self):
          """

datasourcelib/strategies/full_load.py
@@ -1,6 +1,6 @@
  from datasourcelib.core.sync_base import SyncBase
  from datasourcelib.utils.logger import get_logger
- from datasourcelib.indexes.azure_search_index_vector import AzureSearchIndexer
+ from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
  logger = get_logger(__name__)

  class FullLoadStrategy(SyncBase):

datasourcelib-0.1.5.dist-info/METADATA
@@ -1,10 +1,10 @@
  Metadata-Version: 2.4
  Name: datasourcelib
- Version: 0.1.3
+ Version: 0.1.5
  Summary: Data source sync strategies for vector DBs
- Home-page: https://github.com/jaiprakash0217/datasourcelib
- Author: Jai Prakash
- Author-email: jai.prakash@jai12ka4.com
+ Home-page: https://github.com/akashmaurya0217/datasourcelib
+ Author: Akash Kumar Maurya
+ Author-email: mrelectronicsarduino@gmail.com
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License

datasourcelib-0.1.5.dist-info/RECORD
@@ -4,20 +4,19 @@ datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3N
  datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
  datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
  datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
- datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+ datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+ datasourcelib/datasources/azure_devops_source.py,sha256=3hyZIrUdgwZEQNjb2iZGDMJcAw3Z6r7oV0hWAq_zMsg,8005
  datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
  datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
  datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
  datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
- datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
- datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
+ datasourcelib/datasources/sharepoint_source.py,sha256=t3rly2mVEI2qEDuUVqstck5ktkZW0BnF16Bke_NjPLI,23126
+ datasourcelib/datasources/sql_source.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
  datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
- datasourcelib/indexes/azure_search_index.py,sha256=o3BoSxURBk5jCC3AlNz-v9_igg-dXYS4yUxXZwSfqFg,17265
- datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
- datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
+ datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
  datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
  datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
- datasourcelib/strategies/full_load.py,sha256=zqDZZcmyJKXQ4v3coq5njjadlBNI9V8f_lfXVZCoLbQ,1698
+ datasourcelib/strategies/full_load.py,sha256=U1a9wO_ZLRnMInvU0IRW-ZKnhu0Cv437VcNMKIYuzMA,1691
  datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
  datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
  datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900

@@ -27,8 +26,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
  datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
  datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
  datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
- datasourcelib-0.1.3.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
- datasourcelib-0.1.3.dist-info/METADATA,sha256=cPVrPEkPN22sTYOoO20byXcpu5hvKVQIPu3elgyyEko,1185
- datasourcelib-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- datasourcelib-0.1.3.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
- datasourcelib-0.1.3.dist-info/RECORD,,
+ datasourcelib-0.1.5.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+ datasourcelib-0.1.5.dist-info/METADATA,sha256=jDGgTdya-zt_go_TpEOJNfTQUI7CsbjM4m-Fg51XdqU,1199
+ datasourcelib-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datasourcelib-0.1.5.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+ datasourcelib-0.1.5.dist-info/RECORD,,

datasourcelib/indexes/azure_search_index_only.py (removed file)
@@ -1,162 +0,0 @@
- from typing import List, Dict, Any, Optional
- from datasourcelib.utils.logger import get_logger
-
- logger = get_logger(__name__)
-
- class AzureSearchIndexer:
-     """
-     Minimal Azure Cognitive Search indexer wrapper.
-     Expects vector_db_config with:
-       - service_endpoint: str
-       - index_name: str
-       - api_key: str
-     Optional:
-       - key_field: name of unique key in documents (default 'id')
-     """
-
-     def __init__(self, vector_db_config: Dict[str, Any]):
-         self.config = vector_db_config or {}
-         self._client = None
-         self._index_client = None
-
-     def validate_config(self) -> bool:
-         required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-         missing = [k for k in required if k not in self.config]
-         if missing:
-             logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-             return False
-         return True
-
-     def _ensure_sdk(self):
-         try:
-             from azure.core.credentials import AzureKeyCredential # type: ignore
-             from azure.search.documents import SearchClient # type: ignore
-             from azure.search.documents.indexes import SearchIndexClient # type: ignore
-             from azure.search.documents.indexes.models import (
-                 SearchIndex,
-                 SimpleField,
-                 SearchableField,
-                 SearchFieldDataType,
-             ) # type: ignore
-         except Exception as e:
-             raise RuntimeError("azure-search-documents package is required: install azure-search-documents") from e
-
-         return AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType
-
-     def _infer_field_type(self, value) -> Any:
-         """
-         Map Python types to SearchFieldDataType
-         """
-         *_, SearchFieldDataType = self._ensure_sdk()
-         if value is None:
-             return SearchFieldDataType.String
-         t = type(value)
-         if t is str:
-             return SearchFieldDataType.String
-         if t is bool:
-             return SearchFieldDataType.Boolean
-         if t is int:
-             return SearchFieldDataType.Int32
-         if t is float:
-             return SearchFieldDataType.Double
-         # fallback to string
-         return SearchFieldDataType.String
-
-     def _build_fields(self, sample: Dict[str, Any], key_field: str):
-         AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-
-         fields = []
-         # ensure key field present
-         if key_field not in sample:
-             # we'll create a string key, uploader will populate unique ids
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-         else:
-             typ = self._infer_field_type(sample[key_field])
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-         for k, v in sample.items():
-             logger.info(f"================={k}============")
-             if k == key_field:
-                 continue
-             typ = self._infer_field_type(v)
-             # for strings use SearchableField so full text queries work
-             if typ == SearchFieldDataType.String:
-                 fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-             else:
-                 fields.append(SimpleField(name=k, type=typ))
-         return fields
-
-     def create_index(self, sample: Dict[str, Any]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-             fields = self._build_fields(sample, key_field)
-             logger.info("=================Creating Index============")
-             index = SearchIndex(name=index_name, fields=fields)
-             # create or update index
-             index_client.create_or_update_index(index)
-             logger.info("Azure Search index '%s' created/updated", index_name)
-             return True
-         except Exception as ex:
-             logger.exception("AzureSearchIndexer.create_index failed")
-             return False
-
-     def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             # ensure each doc has key_field
-             from uuid import uuid4
-             for d in docs:
-                 if key_field not in d:
-                     d[key_field] = str(uuid4())
-             # ensure each doc has key_field is of string type
-             for d in docs:
-                 if key_field in d:
-                     typ = self._infer_field_type(d[key_field])
-                     if typ != SearchFieldDataType.String:
-                         d[key_field] = str(d[key_field])
-
-             client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
-             logger.info("Uploading %d documents to index %s", len(docs), index_name)
-             result = client.upload_documents(documents=docs)
-             # Check results for failures
-             failed = [r for r in result if not r.succeeded]
-             if failed:
-                 logger.error("Some documents failed to upload: %s", failed)
-                 return False
-             logger.info("Uploaded documents successfully")
-             return True
-         except Exception:
-             logger.exception("AzureSearchIndexer.upload_documents failed")
-             return False
-
-     def index(self, rows: List[Dict[str, Any]]) -> bool:
-         """
-         High level: create index (based on first row) and upload all rows.
-         """
-         if not rows:
-             logger.error("AzureSearchIndexer.index called with empty rows")
-             return False
-         try:
-             if not self.validate_config():
-                 return False
-             sample = rows[0]
-             logger.info(f"================={sample}============")
-             ok = self.create_index(sample)
-             if not ok:
-                 return False
-             ok2 = self.upload_documents(rows)
-             return ok2
-         except Exception:
-             logger.exception("AzureSearchIndexer.index failed")
-             return False

datasourcelib/indexes/azure_search_index_vector.py (removed file)
@@ -1,286 +0,0 @@
- from typing import List, Dict, Any, Optional
- from datasourcelib.utils.logger import get_logger
-
- logger = get_logger(__name__)
-
- class AzureSearchIndexer:
-     """
-     Azure Cognitive Search indexer with vector search support.
-     Required vector_db_config:
-       - aisearch_endpoint: str
-       - aisearch_index_name: str
-       - aisearch_api_key
-
-     Optional vector search config:
-       - vectorization: bool (enable vector search)
-       - vector_config: dict
-           - dimensions: int (default 1024)
-           - algorithm: str ('hnsw' or 'flat', default 'hnsw')
-           - metric: str ('cosine', 'euclidean', 'dotProduct', default 'cosine')
-       - key_field: str (default 'id')
-       - vector_field: str (default 'contentVector')
-       - embedding_endpoint: str (Azure OpenAI endpoint for embeddings)
-       - embedding_key: str (Azure OpenAI API key)
-       - embedding_deployment: str (Azure OpenAI model deployment name)
-     """
-
-     def __init__(self, vector_db_config: Dict[str, Any]):
-         self.config = vector_db_config or {}
-         self._client = None
-         self._index_client = None
-         self._embedding_client = None
-
-     def validate_config(self) -> bool:
-         required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-         missing = [k for k in required if k not in self.config]
-
-         # Check vector search requirements if enabled
-         if self.config.get("vectorization", False):
-             vector_required = ("embedding_endpoint", "embedding_key", "embedding_deployment")
-             missing.extend([k for k in vector_required if k not in self.config])
-
-         if missing:
-             logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-             return False
-         return True
-
-     def _ensure_sdk(self):
-         try:
-             from azure.core.credentials import AzureKeyCredential # type: ignore
-             from azure.search.documents import SearchClient # type: ignore
-             from azure.search.documents.indexes import SearchIndexClient # type: ignore
-             from openai import AzureOpenAI # type: ignore
-             from azure.search.documents.indexes.models import (
-                 SearchIndex,
-                 SearchField,
-                 SearchFieldDataType,
-                 SimpleField,
-                 SearchableField,
-                 VectorSearch,
-                 VectorSearchProfile,
-                 HnswAlgorithmConfiguration
-             ) # type: ignore
-
-         except Exception as e:
-             raise RuntimeError("Required packages missing. Install: azure-search-documents openai") from e
-
-         return (
-             AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration
-         )
-
-     def _setup_embedding_client(self):
-         if not self._embedding_client and self.config.get("vectorization"):
-             try:
-                 AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-                 self._embedding_client = AzureOpenAI(
-                     api_version=self.config["embedding_api_version"],
-                     azure_endpoint=self.config["embedding_endpoint"],
-                     api_key=self.config["embedding_key"],
-                 )
-                 logger.info("Azure OpenAI embedding client initialized")
-             except Exception as ex:
-                 logger.exception("Failed to initialize embedding client")
-                 raise
-
-     def _get_embeddings(self, text: str) -> List[float]:
-         try:
-             self._setup_embedding_client()
-             response = self._embedding_client.embeddings.create(
-                 model=self.config["embedding_deployment"],
-                 input=text
-             )
-             return response.data[0].embedding
-         except Exception as ex:
-             logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
-             raise
-
-     def _build_vector_search_config(self):
-         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-         vector_config = self.config.get("vector_config", {})
-         dimensions = vector_config.get("dimensions", 1536)
-
-         vector_search = VectorSearch(
-             profiles=[VectorSearchProfile(name="vector-profile-1", algorithm_configuration_name="algorithms-config-1")],
-             algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")]
-         )
-
-         return vector_search, dimensions
-
-     def _infer_field_type(self, value) -> Any:
-         #Map Python types to SearchFieldDataType, including collections
-
-         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-         if value is None:
-             return SearchFieldDataType.String
-
-         t = type(value)
-
-         # Handle list/array types as Collections
-         if t in (list, tuple):
-             # If empty list, default to Collection of Double
-             if not value:
-                 return SearchFieldDataType.Collection(SearchFieldDataType.Double)
-             # Get type of first element for non-empty lists
-             element_type = self._infer_field_type(value[0])
-             return SearchFieldDataType.Collection(element_type)
-         # Handle vector embeddings (list or tuple of floats)
-         if type(value) in (list, tuple) and all(isinstance(x, (int, float)) for x in value):
-             return SearchFieldDataType.Collection(SearchFieldDataType.Single)
-
-         # Handle basic types
-         logger.info(f"######## Infer field type for value:[ {value} ] of type [ {t} ]")
-         if t is bool:
-             return SearchFieldDataType.Boolean
-         if t is int:
-             return SearchFieldDataType.Int32
-         if t is float:
-             return SearchFieldDataType.Double
-         print(f"############## Infer field type for value: {value} of type {t}")
-         print(t is str)
-         if t is str:
-             return SearchFieldDataType.String
-         # fallback to string
-         logger.warning(f"Falling back to string type for value: {value} of type {t}")
-         return SearchFieldDataType.String
-
-     def _build_fields(self, sample: Dict[str, Any], key_field: str):
-         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-         fields = []
-         # Add key field
-         if key_field not in sample:
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-         else:
-             fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-         # Add regular fields
-         for k, v in sample.items():
-             logger.info(f"================={k}============")
-             if k == key_field:
-                 continue
-             logger.info(f"#### Infer field type for field: {k}")
-             typ = self._infer_field_type(v)
-             logger.info(f"#### Inferred type for field {k}: {typ}")
-             if typ == SearchFieldDataType.String:
-                 fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-             else:
-                 fields.append(SimpleField(name=k, type=typ))
-
-         # Add vector field if vectorization is enabled
-         if self.config.get("vectorization"):
-             vector_field = self.config.get("vector_field", "contentVector")
-             _, dimensions = self._build_vector_search_config()
-             fields.append(
-                 SearchField(
-                     name=vector_field,
-                     type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
-                     searchable=True,
-                     vector_search_dimensions=dimensions,
-                     vector_search_profile_name="vector-profile-1"
-                 )
-             )
-
-         return fields
-
-     def create_index(self, sample: Dict[str, Any]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-             fields = self._build_fields(sample, key_field)
-
-             # Create index with vector search if enabled
-             if self.config.get("vectorization"):
-                 vector_search, _ = self._build_vector_search_config()
-                 index = SearchIndex(
-                     name=index_name,
-                     fields=fields,
-                     vector_search=vector_search
-                 )
-             else:
-                 index = SearchIndex(name=index_name, fields=fields)
-
-             index_client.create_or_update_index(index)
-             logger.info(f"Azure Search index '{index_name}' created/updated with vectorization={self.config.get('vectorization', False)}")
-             return True
-         except Exception as ex:
-             logger.exception("AzureSearchIndexer.create_index failed")
-             return False
-
-     def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-         try:
-             AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-             endpoint = self.config["aisearch_endpoint"]
-             api_key = self.config["aisearch_api_key"]
-             index_name = self.config["aisearch_index_name"]
-             key_field = self.config.get("key_field", "id")
-
-             # Add IDs if missing
-             from uuid import uuid4
-             for d in docs:
-                 if key_field not in d:
-                     d[key_field] = str(uuid4())
-                 elif not isinstance(d[key_field], str):
-                     d[key_field] = str(d[key_field])
-
-             # Add vector embeddings if enabled
-             if self.config.get("vectorization"):
-                 vector_field = self.config.get("vector_field", "contentVector")
-                 content_field = self.config.get("content_field", "content")
-
-                 for doc in docs:
-                     if content_field in doc:
-                         try:
-                             embedding = self._get_embeddings(str(doc[content_field]))
-                             doc[vector_field] = embedding
-                         except Exception as e:
-                             logger.error(f"Failed to get embedding for document {doc.get(key_field)}: {str(e)}")
-                             continue
-
-             client = SearchClient(endpoint=endpoint, index_name=index_name,
-                                   credential=AzureKeyCredential(api_key))
-
-             logger.info(f"Uploading {len(docs)} documents to index {index_name}")
-             result = client.upload_documents(documents=docs)
-
-             failed = [r for r in result if not r.succeeded]
-             if failed:
-                 logger.error(f"Some documents failed to upload: {failed}")
-                 return False
-
-             logger.info("Documents uploaded successfully")
-             return True
-
-         except Exception:
-             logger.exception("AzureSearchIndexer.upload_documents failed")
-             return False
-
-     def index(self, rows: List[Dict[str, Any]]) -> bool:
-         """High level: create index (based on first row) and upload all rows."""
-         if not rows:
-             logger.error("AzureSearchIndexer.index called with empty rows")
-             return False
-
-         try:
-             if not self.validate_config():
-                 return False
-
-             sample = rows[0]
-             logger.info(f"Creating/updating index with sample: {sample}")
-
-             ok = self.create_index(sample)
-             if not ok:
-                 return False
-
-             ok2 = self.upload_documents(rows)
-             return ok2
-
-         except Exception:
-             logger.exception("AzureSearchIndexer.index failed")
-             return False