datasourcelib 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasourcelib/datasources/azure_devops_source copy.py +126 -0
- datasourcelib/datasources/azure_devops_source.py +122 -64
- datasourcelib/datasources/sharepoint_source.py +79 -26
- datasourcelib/datasources/sql_source.py +29 -12
- datasourcelib/indexes/azure_search_index.py +102 -1
- datasourcelib/strategies/full_load.py +1 -1
- {datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/METADATA +4 -4
- {datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/RECORD +11 -12
- datasourcelib/indexes/azure_search_index_only.py +0 -162
- datasourcelib/indexes/azure_search_index_vector.py +0 -286
- {datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/WHEEL +0 -0
- {datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/top_level.txt +0 -0
datasourcelib/datasources/azure_devops_source copy.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Optional
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+import base64
+import json
+from bs4 import BeautifulSoup
+
+logger = get_logger(__name__)
+
+try:
+    import requests  # type: ignore
+except Exception:
+    requests = None  # lazy import handled at runtime
+
+class AzureDevOpsSource(DataSourceBase):
+
+    def validate_config(self) -> bool:
+        try:
+            require_keys(self.config, ["ado_organization", "ado_personal_access_token","ado_project","ado_query_id"])
+            return True
+        except Exception as ex:
+            logger.error("AzureDevOpsSource.validate_config: %s", ex)
+            return False
+
+    def connect(self) -> bool:
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        # No persistent connection; store auth header
+        pat = self.config.get("ado_personal_access_token")
+        token = pat
+        token_b64 = base64.b64encode(token.encode("utf-8")).decode("utf-8")
+        self._headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+        self._connected = True
+        logger.info("AzureDevOpsSource ready (no persistent connection required)")
+        return True
+
+    def disconnect(self) -> None:
+        self._headers = {}
+        self._connected = False
+        logger.info("AzureDevOpsSource cleared")
+
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        if not getattr(self, "_connected", False):
+            self.connect()
+
+        org = self.config.get("ado_organization")
+        project = self.config.get("ado_project")
+        query_id = self.config.get("ado_query_id")
+        api_version = self.config.get("api_version", "7.1")
+        #path = self.config.get("query_path", query or "")
+        if not query_id:
+            raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")
+
+        base = f"https://dev.azure.com/{org}/"
+        if project:
+            base = f"{base}{project}/"
+        url = f"{base}_apis/wit/wiql/{query_id}"
+        params = {"api-version": api_version}
+        method = self.config.get("method", "GET").upper()
+        query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+        query_response.raise_for_status()
+        #data = resp.json()
+        # Check if the request was successful
+        if query_response.status_code == 200:
+            work_items = query_response.json()['workItems']
+            work_item_details = []
+
+            # Loop through each work item ID to get detailed information
+            for item in work_items:
+                work_item_id = item['id']
+                work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
+                work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
+
+                if work_item_response.status_code == 200:
+                    logger.info(f"Current Item: {work_item_id}")
+                    text = work_item_response.json()['fields']['System.Description']
+                    c_desc=BeautifulSoup(text, "html.parser").get_text()
+                    c_changedate = work_item_response.json()['fields']['System.ChangedDate']
+                    c_title = work_item_response.json()['fields']['System.Title']
+                    c_status = work_item_response.json()['fields']['System.State']
+                    c_type = work_item_response.json()['fields']['System.WorkItemType']
+                    c_created = work_item_response.json()['fields']['System.CreatedDate']
+
+                    default_value = "-VALUE NOT ASSIGNED-"
+                    c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
+                    logger.info(c_assigned)
+                    c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
+                    c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
+                    c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
+                    c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
+
+                    #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                    fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
+                    # Ensure work_item_details is a list and append a dict for this work item
+
+                    work_item_details.append({
+                        "id": work_item_id,
+                        "type": c_type,
+                        "title": c_title,
+                        "status": c_status,
+                        "assigned_to": c_assigned,
+                        "created": c_created,
+                        "changed_date": c_changedate,
+                        "tags": c_tags,
+                        "release_type": c_rtype,
+                        "target_date": c_rdate,
+                        "project": c_project,
+                        "description": c_desc,
+                        "full": fullfeature
+                    })
+                else:
+                    logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
+
+            #work_item_desc = []
+            #for desc in work_item_details:
+            #    work_item_desc.append(desc['fields']['System.Description'])
+
+            return work_item_details #[{"response": json.dumps(work_item_details)}]
+        else:
+            raise RuntimeError(f"Error: {query_response.status_code}")
+        # Caller decides how to interpret the payload; default: return raw json in a single-item list
+
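
Note: for reference, here is a minimal standalone sketch of the WIQL call this class wraps (organization, project, query id, and PAT are placeholder values, not taken from the package). One quirk worth flagging: Azure DevOps personal access tokens are conventionally sent as HTTP Basic credentials, base64 of ":" + PAT, which is presumably why connect() computes token_b64; the header it actually sets sends the raw PAT as a Bearer token instead.

    import base64
    import requests

    # Placeholder values for illustration only.
    org, project = "myorg", "myproject"
    query_id = "00000000-0000-0000-0000-000000000000"
    pat = "<personal-access-token>"

    # PATs are normally sent as Basic credentials with an empty username.
    basic = base64.b64encode(f":{pat}".encode("utf-8")).decode("utf-8")
    headers = {"Authorization": f"Basic {basic}", "Content-Type": "application/json"}

    resp = requests.get(
        f"https://dev.azure.com/{org}/{project}/_apis/wit/wiql/{query_id}",
        headers=headers,
        params={"api-version": "7.1"},
    )
    resp.raise_for_status()
    # The WIQL endpoint returns work item references, not full fields.
    print([ref["id"] for ref in resp.json().get("workItems", [])])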
datasourcelib/datasources/azure_devops_source.py
@@ -50,77 +50,135 @@ class AzureDevOpsSource(DataSourceBase):
         project = self.config.get("ado_project")
         query_id = self.config.get("ado_query_id")
         api_version = self.config.get("api_version", "7.1")
-        #path = self.config.get("query_path", query or "")
         if not query_id:
             raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")

         base = f"https://dev.azure.com/{org}/"
         if project:
             base = f"{base}{project}/"
-        url = f"{base}_apis/wit/wiql/{query_id}"
+        # WIQL query by id (returns list of work item refs)
+        wiql_url = f"{base}_apis/wit/wiql/{query_id}"
         params = {"api-version": api_version}
         method = self.config.get("method", "GET").upper()
-        query_response = requests.request(method, url, headers=getattr(self, "_headers", {}), params=params) #, json=self.config.get("payload")
+        query_response = requests.request(method, wiql_url, headers=getattr(self, "_headers", {}), params=params)
         query_response.raise_for_status()
-        #data = resp.json()
-        # Check if the request was successful
-        if query_response.status_code == 200:
-            work_items = query_response.json()['workItems']
-            work_item_details = []
-
-            # Loop through each work item ID to get detailed information
-            for item in work_items:
-                work_item_id = item['id']
-                work_item_url = f'https://dev.azure.com/{org}/{project}/_apis/wit/workitems/{work_item_id}?api-version=7.1'
-                work_item_response = requests.get(work_item_url, headers=getattr(self, "_headers", {}))
-
-                if work_item_response.status_code == 200:
-                    logger.info(f"Current Item: {work_item_id}")
-                    text = work_item_response.json()['fields']['System.Description']
-                    c_desc=BeautifulSoup(text, "html.parser").get_text()
-                    c_changedate = work_item_response.json()['fields']['System.ChangedDate']
-                    c_title = work_item_response.json()['fields']['System.Title']
-                    c_status = work_item_response.json()['fields']['System.State']
-                    c_type = work_item_response.json()['fields']['System.WorkItemType']
-                    c_created = work_item_response.json()['fields']['System.CreatedDate']
-
-                    default_value = "-VALUE NOT ASSIGNED-"
-                    c_assigned = work_item_response.json()['fields'].get('System.AssignedTo',{}).get('displayName',default_value)
-                    logger.info(c_assigned)
-                    c_tags = work_item_response.json()['fields'].get('System.Tags',default_value)
-                    c_project = work_item_response.json()['fields'].get('Custom.ProjectName',default_value)
-                    c_rtype = work_item_response.json()['fields'].get('Custom.Releasetype',default_value)
-                    c_rdate = work_item_response.json()['fields'].get('Microsoft.VSTS.Scheduling.TargetDate',default_value)
-
-                    #fullfeature = f"{c_type} ID {work_item_id} was created on {c_created} for a {c_rtype} release of Project '{c_project}' with target date '{c_rdate}' and has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned} and last modified on {c_changedate}.Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                    fullfeature = f"{c_type} ID {work_item_id} was created on {c_created}. {c_type} ID {work_item_id} is a {c_rtype} release of Project '{c_project}'. {c_type} ID {work_item_id} Release has target date '{c_rdate}'.{c_type} ID {work_item_id} has given Title as '{c_title}'. {c_type} ID {work_item_id} is currently in {c_status} state. {c_type} ID {work_item_id} is assigned to {c_assigned}. {c_type} ID {work_item_id} is last modified on {c_changedate}. Tags Applied to {c_type} ID {work_item_id} are {c_tags}. Full Description of {c_type} ID {work_item_id} is [{c_desc}]."
-                    # Ensure work_item_details is a list and append a dict for this work item
-
-                    work_item_details.append({
-                        "id": work_item_id,
-                        "type": c_type,
-                        "title": c_title,
-                        "status": c_status,
-                        "assigned_to": c_assigned,
-                        "created": c_created,
-                        "changed_date": c_changedate,
-                        "tags": c_tags,
-                        "release_type": c_rtype,
-                        "target_date": c_rdate,
-                        "project": c_project,
-                        "description": c_desc,
-                        "full": fullfeature
-                    })
-                else:
-                    logger.error(f"Error fetching details for work item ID {work_item_id}: {work_item_response.status_code}")
-
-            #work_item_desc = []
-            #for desc in work_item_details:
-            #    work_item_desc.append(desc['fields']['System.Description'])
-
-            return work_item_details #[{"response": json.dumps(work_item_details)}]
-        else:
+
+        if query_response.status_code != 200:
             raise RuntimeError(f"Error: {query_response.status_code}")
-        # Caller decides how to interpret the payload; default: return raw json in a single-item list
-
+
+        work_items_refs = query_response.json().get('workItems', []) or []
+        if not work_items_refs:
+            return []
+
+        # collect ids and fetch details in batch to get all fields for all work item types
+        ids = [str(item.get('id')) for item in work_items_refs if item.get('id')]
+        if not ids:
+            return []
+
+        details_url = f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems"
+        # expand=all to include fields, relations, and attachments
+        params = {
+            "ids": ",".join(ids),
+            "api-version": api_version,
+            "$expand": "all"
+        }
+        details_resp = requests.get(details_url, headers=getattr(self, "_headers", {}), params=params)
+        details_resp.raise_for_status()
+        items = details_resp.json().get("value", [])
+
+        work_item_details: List[Dict[str, Any]] = []
+        for item in items:
+            item_id = item.get("id")
+            fields = item.get("fields", {}) or {}
+
+            # Normalize field keys to safe snake_case-like keys
+            norm_fields: Dict[str, Any] = {}
+            for k, v in fields.items():
+                nk = k.replace(".", "_")
+                nk = nk.lower()
+                norm_fields[nk] = v
+
+            # Helper to safely extract nested displayName for assigned to
+            assigned = norm_fields.get("system_assignedto")
+            if isinstance(assigned, dict):
+                assigned_to = assigned.get("displayName") or assigned.get("uniqueName") or str(assigned)
+            else:
+                assigned_to = assigned
+
+            # find a description-like field (some types use different field names)
+            desc = ""
+            for fk in ["system_description", "microsoft_vsts_createdby", "html_description"]:
+                if fk in norm_fields:
+                    desc = norm_fields.get(fk) or ""
+                    break
+            if not desc:
+                # fallback: first field key that contains 'description'
+                for kf, vf in norm_fields.items():
+                    if "description" in kf and vf:
+                        desc = vf
+                        break
+
+            # clean HTML description to text
+            try:
+                c_desc = BeautifulSoup(desc or "", "html.parser").get_text()
+            except Exception:
+                c_desc = desc or ""
+
+            # Build common convenience values (use available fields)
+            wi_type = norm_fields.get("system_workitemtype") or norm_fields.get("system_witype") or ""
+            title = norm_fields.get("system_title") or ""
+            status = norm_fields.get("system_state") or ""
+            created = norm_fields.get("system_createddate") or norm_fields.get("system_created") or ""
+            changed = norm_fields.get("system_changeddate") or norm_fields.get("system_changed") or ""
+            tags = norm_fields.get("system_tags", "")
+            project_name = norm_fields.get("custom.projectname") or norm_fields.get("system_teamproject") or ""
+
+            rtype = norm_fields.get("custom.releasetype") or norm_fields.get("custom_releasetype") or ""
+            target_date = norm_fields.get("microsoft_vsts_scheduling_targetdate") or norm_fields.get("microsoft.vsts.scheduling.targetdate") or ""
+
+            # Construct a 'full' description string using available pieces
+            parts = []
+            if wi_type:
+                parts.append(f"{wi_type} ID {item_id}")
+            else:
+                parts.append(f"WorkItem {item_id}")
+            if created:
+                parts.append(f"was created on {created}")
+            if title:
+                parts.append(f"and has Title '{title}'")
+            if status:
+                parts.append(f"is currently in {status} state")
+            if assigned_to:
+                parts.append(f"is assigned to {assigned_to}")
+            if project_name:
+                parts.append(f"for Project '{project_name}'")
+            if rtype:
+                parts.append(f"release type '{rtype}'")
+            if target_date:
+                parts.append(f"with target date '{target_date}'")
+            if tags:
+                parts.append(f"Tags: {tags}")
+            if c_desc:
+                parts.append(f"Description: [{c_desc}]")
+            fullfeature = ". ".join(parts)
+
+            # include all normalized fields in the returned object for completeness
+            entry = {
+                "id": item_id,
+                "type": wi_type,
+                "title": title,
+                "status": status,
+                "assigned_to": assigned_to,
+                "created": created,
+                "changed_date": changed,
+                "tags": tags,
+                "project": project_name,
+                "release_type": rtype,
+                "target_date": target_date,
+                "description": c_desc,
+                "full": fullfeature,
+                "fields": norm_fields  # full field set for this work item
+            }
+            work_item_details.append(entry)
+
+        return work_item_details
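
Note: the rewrite replaces the per-item GET loop with a single batched Work Items call. Here is a standalone sketch of that request (header and ids are hypothetical). One detail visible in the new code: since normalization maps 'Custom.ProjectName' to 'custom_projectname', the dotted lookups ('custom.projectname', 'custom.releasetype', 'microsoft.vsts.scheduling.targetdate') can never match a normalized key, so only their underscore alternatives and fallbacks take effect.

    import requests

    # Hypothetical standalone version of the batched details request used above.
    org, project = "myorg", "myproject"
    headers = {"Authorization": "Basic <base64-of-:PAT>"}
    ids = ["101", "102", "103"]

    resp = requests.get(
        f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems",
        headers=headers,
        params={"ids": ",".join(ids), "api-version": "7.1", "$expand": "all"},
    )
    resp.raise_for_status()
    for item in resp.json().get("value", []):
        # Fields arrive dotted ('System.Title'); the class normalizes them
        # to lowercase-with-underscores keys ('system_title').
        fields = item.get("fields", {})
        print(item.get("id"), fields.get("System.Title"))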
datasourcelib/datasources/sharepoint_source.py
@@ -7,6 +7,7 @@ import requests
 import pandas as pd
 import os
 from uuid import uuid4
+from datetime import datetime, timedelta

 logger = get_logger(__name__)
 reader = ByteReader()
@@ -114,50 +115,86 @@ class SharePointSource(DataSourceBase):
         self._drive_id = drives[0].get("id")
         logger.info("Resolved SharePoint drive ID: %s", self._drive_id)

+    def _get_client_credentials(self) -> Tuple[str, str]:
+        """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+        # Fallback to sp_client_config
+        sp_client_config = self.config.get("sp_client_config", {})
+        client_id = sp_client_config.get("sp_client_id")
+        client_secret = sp_client_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            # Fallback to sp_master_config
+            sp_master_config = self.config.get("sp_master_config", {})
+            client_id = client_id or sp_master_config.get("sp_client_id")
+            client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+        return client_id, client_secret
+
+    def _get_download_credentials(self) -> Tuple[str, str]:
+        """Retrieve client credentials in order of priority: sp_download_config, sp_client_config, sp_master_config."""
+        # Check sp_download_config first
+        sp_download_config = self.config.get("sp_client_config", {}).get("sp_download_config", {})
+        client_id = sp_download_config.get("sp_client_id")
+        client_secret = sp_download_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            # Fallback to sp_client_config
+            sp_client_config = self.config.get("sp_client_config", {})
+            client_id = client_id or sp_client_config.get("sp_client_id")
+            client_secret = client_secret or sp_client_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            # Fallback to sp_master_config
+            sp_master_config = self.config.get("sp_master_config", {})
+            client_id = client_id or sp_master_config.get("sp_client_id")
+            client_secret = client_secret or sp_master_config.get("sp_client_secret")
+
+        if not client_id or not client_secret:
+            raise ValueError("Client ID and Client Secret must be provided in the configuration.")
+
+        return client_id, client_secret
+
+
     def connect(self) -> bool:
         try:
             # basic values
             self._site_url = self.config["sp_site_url"]
-            client_config = self.config["sp_client_config"]
             master_config = self.config["sp_master_config"]

             # get master token (Sites.Read.All)
             try:
-
-
-                )
+                master_client_id = master_config["sp_client_id"]
+                master_client_secret = master_config["sp_client_secret"]
+                self._master_token = self._get_token(master_client_id, master_client_secret, master_config["sp_tenant_id"])
                 logger.info("$$$ - Obtained master access token for SharePoint - $$$")
             except Exception as ex:
                 logger.info("$$$ - Failed to obtain master token - $$$")

             # resolve site and drive ids
             try:
-                self._resolve_site_and_drive(
-                    self.config['sp_site_display_name']
-                )
+                self._resolve_site_and_drive(self.config['sp_site_display_name'])
             except Exception:
                 logger.info("$$$ - Failed to resolve site/drive - $$$")
-
+
             # get client token (Site.Selected) for download operations
             try:
-
-                self._access_token = self._get_token(
-                    client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"]
-                )
+                client_id, client_secret = self._get_client_credentials()
+                self._access_token = self._get_token(client_id, client_secret, master_config["sp_tenant_id"])
                 logger.info("$$$ - Obtained client access token for SharePoint downloads - $$$")
             except Exception:
                 logger.info("$$$ - Failed to obtain client access token - $$$")
-
+
             # get list client token (Site.Selected) for list operations
             try:
-
-                self._list_token = self._get_list_token(
-                    client_config["sp_client_id"], client_config["sp_client_secret"], master_config["sp_tenant_id"],master_config["sp_domain_name"]
-                )
+                client_id, client_secret = self._get_client_credentials()
+                self._list_token = self._get_list_token(client_id, client_secret, master_config["sp_tenant_id"], master_config["sp_domain_name"])
                 logger.info("$$$ - Obtained client list token for SharePoint list operations - $$$")
             except Exception:
                 logger.info("$$$ - Failed to obtain client list token - $$$")
-
+
             self._connected = True
             logger.info("SharePointSource connected for site: %s", self._site_url)
             return True
@@ -324,10 +361,9 @@ class SharePointSource(DataSourceBase):
         results = []
         items = self._fetch_list_items_via_rest(relative_path)

-
-
-
-        )
+        client_id, client_secret = self._get_download_credentials()
+
+        self._access_token = self._get_token(client_id, client_secret, self.config.get("sp_master_config",{})["sp_tenant_id"])
         #test running with hardcoded items
         if False:
             items = []
@@ -350,12 +386,29 @@ class SharePointSource(DataSourceBase):
             })

         for item in items:
-
-
+            #the path after [Shared Documents/] in relative path
+            item_relative_path = item.get("RelativePath") or item.get("relativepath") or item.get("relativePath")
+            item_name = item.get("Title") or item.get("title")
+            item_display_name = item.get("SiteDisplayName") or item.get("sitedisplayname") or item.get("siteDisplayName")
+
+            # Check ModifiedDate filter
+            # "2024-01-15" → 10 chars || "20240115" → 8 chars
+            modified_date_str = item.get("ModifiedDate") or item.get("modifieddate") or item.get("modifiedDate")
+            if modified_date_str:
+                try:
+                    modified_date = datetime.fromisoformat(modified_date_str.replace('Z', '+00:00'))
+                    if datetime.now(modified_date.tzinfo) - modified_date < timedelta(days=1):
+                        continue
+                except Exception:
+                    pass
+
+            if not item_relative_path:
+                logger.warning("Item missing RelativePath: %s", item)
+                continue
+
             #get site id and drive id for this item
             self._resolve_site_and_drive(item_display_name)
-
-            item_relative_path = item.get("RelativePath")
+
             try:
                 content, filename = self._download_file_bytes(item_relative_path)
                 saved = self._save_file_if_requested(content, filename, save_path)
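
Note: a sketch of the config shape the new credential fallback chain expects (key names come from the diff; values and overall structure beyond those keys are assumptions). _get_download_credentials checks sp_client_config.sp_download_config first, then sp_client_config, then sp_master_config; _get_client_credentials skips the download level. Also worth noting: the ModifiedDate filter above skips items modified within the last day (the continue fires when now minus modified is under one day).

    # Hypothetical config illustrating the fallback chain.
    config = {
        "sp_site_url": "https://contoso.sharepoint.com/sites/Example",
        "sp_site_display_name": "Example",
        "sp_master_config": {
            "sp_client_id": "<master-app-id>",
            "sp_client_secret": "<master-secret>",
            "sp_tenant_id": "<tenant-guid>",
            "sp_domain_name": "contoso",
        },
        "sp_client_config": {
            "sp_client_id": "<site-selected-app-id>",
            "sp_client_secret": "<site-selected-secret>",
            # checked first by _get_download_credentials only
            "sp_download_config": {
                "sp_client_id": "<download-app-id>",
                "sp_client_secret": "<download-secret>",
            },
        },
    }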
datasourcelib/datasources/sql_source.py
@@ -16,8 +16,16 @@ class SQLDataSource(DataSourceBase):
         self._is_sqlite = False

     def validate_config(self) -> bool:
+        """
+        Validate config. If sql_windows_auth is True then sql_username/sql_password are optional.
+        Otherwise require sql_username and sql_password.
+        """
         try:
-
+            # Always require server/database at minimum
+            require_keys(self.config, ["sql_server", "sql_database"])
+            # If not using Windows authentication, require credentials
+            if not bool(self.config.get("sql_windows_auth", False)):
+                require_keys(self.config, ["sql_username", "sql_password"])
             return True
         except Exception as ex:
             logger.error("SQLDataSource.validate_config: %s", ex)
@@ -27,22 +35,31 @@ class SQLDataSource(DataSourceBase):
         try:
             sql_server = self.config.get("sql_server", "")
             sql_database = self.config.get("sql_database", "")
-            sql_username = self.config.get("sql_username", "")
-            sql_password = self.config.get("sql_password", "")
             sql_is_onprem = self.config.get("sql_is_onprem", False)
-
+
+            # Determine auth mode: sql_windows_auth (Trusted Connection) overrides username/password
+            sql_windows_auth = bool(self.config.get("sql_windows_auth", False))
+
             # Get available driver
             sql_driver = self._get_available_driver()
-
-            # Build connection string
+
+            # Build connection string
             conn_params = [
                 f'DRIVER={sql_driver}',
                 f'SERVER={sql_server}',
                 f'DATABASE={sql_database}',
-                f'UID={sql_username}',
-                f'PWD={sql_password}'
             ]
-
+
+            if sql_windows_auth:
+                # Use integrated Windows authentication (Trusted Connection)
+                # This will use the current process credentials / kerberos ticket.
+                conn_params.append('Trusted_Connection=yes')
+                logger.info("SQLDataSource using Windows (integrated) authentication")
+            else:
+                sql_username = self.config.get("sql_username", "")
+                sql_password = self.config.get("sql_password", "")
+                conn_params.extend([f'UID={sql_username}', f'PWD={sql_password}'])
+
             # Add encryption settings based on environment
             if not sql_is_onprem:
                 conn_params.extend([
@@ -56,13 +73,13 @@ class SQLDataSource(DataSourceBase):
                 ])

             conn_str = ';'.join(conn_params)
-
+
             # Attempt connection with timeout
             self._conn = pyodbc.connect(conn_str, timeout=30)
             self._connected = True
-            logger.info("SQLDataSource connected to %s using driver %s", sql_server, sql_driver)
+            logger.info("SQLDataSource connected to %s using driver %s (sql_windows_auth=%s)", sql_server, sql_driver, sql_windows_auth)
             return True
-
+
         except pyodbc.Error as ex:
             logger.error("SQLDataSource.connect failed - ODBC Error: %s", ex)
             self._connected = False
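
Note: a sketch of the two connection strings this change can now produce. Server, database, credential, and driver values are hypothetical; the package discovers the driver via _get_available_driver().

    # sql_windows_auth=True -> integrated authentication, no UID/PWD
    driver = "{ODBC Driver 18 for SQL Server}"  # hypothetical driver name
    trusted = ";".join([
        f"DRIVER={driver}",
        "SERVER=myserver.example.com",
        "DATABASE=mydb",
        "Trusted_Connection=yes",
    ])

    # sql_windows_auth=False (default) -> SQL authentication, as before
    sql_auth = ";".join([
        f"DRIVER={driver}",
        "SERVER=myserver.example.com",
        "DATABASE=mydb",
        "UID=myuser",
        "PWD=mypassword",
    ])

    print(trusted)
    print(sql_auth)
    # import pyodbc
    # conn = pyodbc.connect(trusted, timeout=30)  # needs a domain/Kerberos context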
datasourcelib/indexes/azure_search_index.py
@@ -110,7 +110,7 @@ class AzureSearchIndexer:
             logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
             raise

-    def
+    def _build_vector_search_config_old(self):
         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
         vector_config = self.config.get("vector_config", {})
         dimensions = vector_config.get("dimensions", 1536)
@@ -121,6 +121,107 @@ class AzureSearchIndexer:
         )

         return vector_search, dimensions
+
+    def _build_vector_search_config(self):
+        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
+
+        vector_config = self.config.get("vector_config", {})
+        dimensions = vector_config.get("dimensions", 1536)
+        algorithm = vector_config.get("algorithm", "hnsw").lower()
+
+        # Build algorithm configuration (SDK model if available)
+        alg_cfg = HnswAlgorithmConfiguration(name="algorithms-config-1")
+
+        # Build vectorizer settings using Azure OpenAI config from vector_db_config
+        deployment = self.config.get("embedding_deployment")
+        endpoint = self.config.get("embedding_endpoint")
+        api_key = self.config.get("embedding_key")
+        # modelName required for API version 2025-09-01 — prefer explicit embedding_model, fall back to deployment
+        model_name = self.config.get("embedding_model") or deployment
+        content_field = self.config.get("content_field", "content")
+        vector_field = self.config.get("vector_field", "contentVector")
+
+        if not model_name:
+            raise RuntimeError("Vectorizer configuration requires 'embedding_model' or 'embedding_deployment' in vector_db_config")
+
+        # Define vectorizer with explicit name and required azureOpenAIParameters including modelName
+        vectorizer_name = "azure-openai-vectorizer"
+        vectorizer = {
+            "name": vectorizer_name,
+            "kind": "azureOpenAI",
+            "azureOpenAIParameters": {
+                "resourceUri": endpoint.rstrip('/') if endpoint else None,
+                # include both modelName (required) and deploymentId (if provided)
+                "modelName": model_name,
+                **({"deploymentId": deployment} if deployment else {}),
+                "apiKey": api_key
+            },
+            "options": {
+                "fieldMapping": [
+                    {
+                        "sourceContext": f"/document/{content_field}",
+                        "outputs": [
+                            {
+                                "targetContext": f"/document/{vector_field}",
+                                "targetDimensions": dimensions
+                            }
+                        ]
+                    }
+                ]
+            }
+        }
+
+        profile_name = "vector-profile-1"
+        try:
+            # Create profile with vectorizer reference (SDK may expect vectorizer_name or vectorizer depending on version)
+            try:
+                profile = VectorSearchProfile(
+                    name=profile_name,
+                    algorithm_configuration_name="algorithms-config-1",
+                    vectorizer_name=vectorizer_name
+                )
+            except TypeError:
+                # fallback if SDK constructor uses different parameter names
+                profile = VectorSearchProfile(name=profile_name, algorithm_configuration_name="algorithms-config-1")
+                try:
+                    setattr(profile, "vectorizer_name", vectorizer_name)
+                except Exception:
+                    pass
+
+            try:
+                # Construct full vector search config with both profile and vectorizer
+                vector_search = VectorSearch(
+                    profiles=[profile],
+                    algorithms=[alg_cfg],
+                    vectorizers=[vectorizer]
+                )
+            except Exception:
+                # Fallback to dict if SDK constructor differs
+                vector_search = {
+                    "profiles": [{
+                        "name": profile_name,
+                        "algorithmConfigurationName": "algorithms-config-1",
+                        "vectorizerName": vectorizer_name
+                    }],
+                    "algorithms": [{"name": "algorithms-config-1"}],
+                    "vectorizers": [vectorizer]
+                }
+        except Exception:
+            # Full dict fallback
+            vector_search = {
+                "profiles": [{
+                    "name": profile_name,
+                    "algorithmConfigurationName": "algorithms-config-1",
+                    "vectorizerName": vectorizer_name
+                }],
+                "algorithms": [{"name": "algorithms-config-1"}],
+                "vectorizers": [vectorizer]
+            }
+
+        logger.info("Built vector_search config (dimensions=%s, model=%s, vectorizer=%s)",
+                    dimensions, model_name, vectorizer_name)
+        return vector_search, dimensions
+

     def _build_semantic_settings(self):
         """
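
Note: a hypothetical vector_db_config covering the keys the new _build_vector_search_config reads. The key names come from the diff; all values are placeholders, and the deployment/model names are examples, not part of the package.

    vector_db_config = {
        "aisearch_endpoint": "https://<search-service>.search.windows.net",
        "aisearch_index_name": "my-index",
        "aisearch_api_key": "<admin-key>",
        "vectorization": True,
        "vector_config": {"dimensions": 1536, "algorithm": "hnsw"},
        "content_field": "content",
        "vector_field": "contentVector",
        "embedding_endpoint": "https://<aoai-resource>.openai.azure.com",
        "embedding_key": "<aoai-key>",
        "embedding_deployment": "text-embedding-ada-002",
        # embedding_model feeds the vectorizer's required modelName;
        # it falls back to embedding_deployment when omitted.
        "embedding_model": "text-embedding-ada-002",
    }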
datasourcelib/strategies/full_load.py
@@ -1,6 +1,6 @@
 from datasourcelib.core.sync_base import SyncBase
 from datasourcelib.utils.logger import get_logger
-from datasourcelib.indexes.
+from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
 logger = get_logger(__name__)

 class FullLoadStrategy(SyncBase):
{datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/METADATA
@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: datasourcelib
-Version: 0.1.3
+Version: 0.1.5
 Summary: Data source sync strategies for vector DBs
-Home-page: https://github.com/
-Author:
-Author-email:
+Home-page: https://github.com/akashmaurya0217/datasourcelib
+Author: Akash Kumar Maurya
+Author-email: mrelectronicsarduino@gmail.com
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
{datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/RECORD
@@ -4,20 +4,19 @@ datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3N
 datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
 datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
 datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
-datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+datasourcelib/datasources/azure_devops_source.py,sha256=3hyZIrUdgwZEQNjb2iZGDMJcAw3Z6r7oV0hWAq_zMsg,8005
 datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
 datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
 datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
 datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
-datasourcelib/datasources/sharepoint_source.py,sha256=
-datasourcelib/datasources/sql_source.py,sha256=
+datasourcelib/datasources/sharepoint_source.py,sha256=t3rly2mVEI2qEDuUVqstck5ktkZW0BnF16Bke_NjPLI,23126
+datasourcelib/datasources/sql_source.py,sha256=ntZjiFXpa7V797x7mAATJV0LH-g878VHuRw-QTxEe28,6372
 datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
-datasourcelib/indexes/azure_search_index.py,sha256=
-datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
-datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
+datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
 datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
 datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
-datasourcelib/strategies/full_load.py,sha256=
+datasourcelib/strategies/full_load.py,sha256=U1a9wO_ZLRnMInvU0IRW-ZKnhu0Cv437VcNMKIYuzMA,1691
 datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
 datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
 datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900
@@ -27,8 +26,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
 datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
 datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
 datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
-datasourcelib-0.1.
-datasourcelib-0.1.
-datasourcelib-0.1.
-datasourcelib-0.1.
-datasourcelib-0.1.
+datasourcelib-0.1.5.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+datasourcelib-0.1.5.dist-info/METADATA,sha256=jDGgTdya-zt_go_TpEOJNfTQUI7CsbjM4m-Fg51XdqU,1199
+datasourcelib-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datasourcelib-0.1.5.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+datasourcelib-0.1.5.dist-info/RECORD,,
datasourcelib/indexes/azure_search_index_only.py
@@ -1,162 +0,0 @@
-from typing import List, Dict, Any, Optional
-from datasourcelib.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-class AzureSearchIndexer:
-    """
-    Minimal Azure Cognitive Search indexer wrapper.
-    Expects vector_db_config with:
-      - service_endpoint: str
-      - index_name: str
-      - api_key: str
-    Optional:
-      - key_field: name of unique key in documents (default 'id')
-    """
-
-    def __init__(self, vector_db_config: Dict[str, Any]):
-        self.config = vector_db_config or {}
-        self._client = None
-        self._index_client = None
-
-    def validate_config(self) -> bool:
-        required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-        missing = [k for k in required if k not in self.config]
-        if missing:
-            logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-            return False
-        return True
-
-    def _ensure_sdk(self):
-        try:
-            from azure.core.credentials import AzureKeyCredential  # type: ignore
-            from azure.search.documents import SearchClient  # type: ignore
-            from azure.search.documents.indexes import SearchIndexClient  # type: ignore
-            from azure.search.documents.indexes.models import (
-                SearchIndex,
-                SimpleField,
-                SearchableField,
-                SearchFieldDataType,
-            )  # type: ignore
-        except Exception as e:
-            raise RuntimeError("azure-search-documents package is required: install azure-search-documents") from e
-
-        return AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType
-
-    def _infer_field_type(self, value) -> Any:
-        """
-        Map Python types to SearchFieldDataType
-        """
-        *_, SearchFieldDataType = self._ensure_sdk()
-        if value is None:
-            return SearchFieldDataType.String
-        t = type(value)
-        if t is str:
-            return SearchFieldDataType.String
-        if t is bool:
-            return SearchFieldDataType.Boolean
-        if t is int:
-            return SearchFieldDataType.Int32
-        if t is float:
-            return SearchFieldDataType.Double
-        # fallback to string
-        return SearchFieldDataType.String
-
-    def _build_fields(self, sample: Dict[str, Any], key_field: str):
-        AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-
-        fields = []
-        # ensure key field present
-        if key_field not in sample:
-            # we'll create a string key, uploader will populate unique ids
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-        else:
-            typ = self._infer_field_type(sample[key_field])
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-        for k, v in sample.items():
-            logger.info(f"================={k}============")
-            if k == key_field:
-                continue
-            typ = self._infer_field_type(v)
-            # for strings use SearchableField so full text queries work
-            if typ == SearchFieldDataType.String:
-                fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-            else:
-                fields.append(SimpleField(name=k, type=typ))
-        return fields
-
-    def create_index(self, sample: Dict[str, Any]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-            fields = self._build_fields(sample, key_field)
-            logger.info("=================Creating Index============")
-            index = SearchIndex(name=index_name, fields=fields)
-            # create or update index
-            index_client.create_or_update_index(index)
-            logger.info("Azure Search index '%s' created/updated", index_name)
-            return True
-        except Exception as ex:
-            logger.exception("AzureSearchIndexer.create_index failed")
-            return False
-
-    def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            # ensure each doc has key_field
-            from uuid import uuid4
-            for d in docs:
-                if key_field not in d:
-                    d[key_field] = str(uuid4())
-            # ensure each doc has key_field is of string type
-            for d in docs:
-                if key_field in d:
-                    typ = self._infer_field_type(d[key_field])
-                    if typ != SearchFieldDataType.String:
-                        d[key_field] = str(d[key_field])
-
-            client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
-            logger.info("Uploading %d documents to index %s", len(docs), index_name)
-            result = client.upload_documents(documents=docs)
-            # Check results for failures
-            failed = [r for r in result if not r.succeeded]
-            if failed:
-                logger.error("Some documents failed to upload: %s", failed)
-                return False
-            logger.info("Uploaded documents successfully")
-            return True
-        except Exception:
-            logger.exception("AzureSearchIndexer.upload_documents failed")
-            return False
-
-    def index(self, rows: List[Dict[str, Any]]) -> bool:
-        """
-        High level: create index (based on first row) and upload all rows.
-        """
-        if not rows:
-            logger.error("AzureSearchIndexer.index called with empty rows")
-            return False
-        try:
-            if not self.validate_config():
-                return False
-            sample = rows[0]
-            logger.info(f"================={sample}============")
-            ok = self.create_index(sample)
-            if not ok:
-                return False
-            ok2 = self.upload_documents(rows)
-            return ok2
-        except Exception:
-            logger.exception("AzureSearchIndexer.index failed")
-            return False
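
Note: both deleted variants exposed the same high-level flow (validate config, create an index from a sample row, upload), which the consolidated azure_search_index.py replaces. A minimal usage sketch, assuming the surviving module keeps the same index(rows) entry point its predecessors had (config values hypothetical):

    from datasourcelib.indexes.azure_search_index import AzureSearchIndexer

    cfg = {
        "aisearch_endpoint": "https://<search-service>.search.windows.net",
        "aisearch_index_name": "demo-index",
        "aisearch_api_key": "<admin-key>",
    }
    rows = [{"id": "1", "content": "hello world"}]

    indexer = AzureSearchIndexer(cfg)
    ok = indexer.index(rows)  # create index from rows[0], then upload all rows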
datasourcelib/indexes/azure_search_index_vector.py
@@ -1,286 +0,0 @@
-from typing import List, Dict, Any, Optional
-from datasourcelib.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-class AzureSearchIndexer:
-    """
-    Azure Cognitive Search indexer with vector search support.
-    Required vector_db_config:
-      - aisearch_endpoint: str
-      - aisearch_index_name: str
-      - aisearch_api_key
-
-    Optional vector search config:
-      - vectorization: bool (enable vector search)
-      - vector_config: dict
-        - dimensions: int (default 1024)
-        - algorithm: str ('hnsw' or 'flat', default 'hnsw')
-        - metric: str ('cosine', 'euclidean', 'dotProduct', default 'cosine')
-      - key_field: str (default 'id')
-      - vector_field: str (default 'contentVector')
-      - embedding_endpoint: str (Azure OpenAI endpoint for embeddings)
-      - embedding_key: str (Azure OpenAI API key)
-      - embedding_deployment: str (Azure OpenAI model deployment name)
-    """
-
-    def __init__(self, vector_db_config: Dict[str, Any]):
-        self.config = vector_db_config or {}
-        self._client = None
-        self._index_client = None
-        self._embedding_client = None
-
-    def validate_config(self) -> bool:
-        required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-        missing = [k for k in required if k not in self.config]
-
-        # Check vector search requirements if enabled
-        if self.config.get("vectorization", False):
-            vector_required = ("embedding_endpoint", "embedding_key", "embedding_deployment")
-            missing.extend([k for k in vector_required if k not in self.config])
-
-        if missing:
-            logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-            return False
-        return True
-
-    def _ensure_sdk(self):
-        try:
-            from azure.core.credentials import AzureKeyCredential  # type: ignore
-            from azure.search.documents import SearchClient  # type: ignore
-            from azure.search.documents.indexes import SearchIndexClient  # type: ignore
-            from openai import AzureOpenAI  # type: ignore
-            from azure.search.documents.indexes.models import (
-                SearchIndex,
-                SearchField,
-                SearchFieldDataType,
-                SimpleField,
-                SearchableField,
-                VectorSearch,
-                VectorSearchProfile,
-                HnswAlgorithmConfiguration
-            )  # type: ignore
-
-        except Exception as e:
-            raise RuntimeError("Required packages missing. Install: azure-search-documents openai") from e
-
-        return (
-            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration
-        )
-
-    def _setup_embedding_client(self):
-        if not self._embedding_client and self.config.get("vectorization"):
-            try:
-                AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-                self._embedding_client = AzureOpenAI(
-                    api_version=self.config["embedding_api_version"],
-                    azure_endpoint=self.config["embedding_endpoint"],
-                    api_key=self.config["embedding_key"],
-                )
-                logger.info("Azure OpenAI embedding client initialized")
-            except Exception as ex:
-                logger.exception("Failed to initialize embedding client")
-                raise
-
-    def _get_embeddings(self, text: str) -> List[float]:
-        try:
-            self._setup_embedding_client()
-            response = self._embedding_client.embeddings.create(
-                model=self.config["embedding_deployment"],
-                input=text
-            )
-            return response.data[0].embedding
-        except Exception as ex:
-            logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
-            raise
-
-    def _build_vector_search_config(self):
-        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-        vector_config = self.config.get("vector_config", {})
-        dimensions = vector_config.get("dimensions", 1536)
-
-        vector_search = VectorSearch(
-            profiles=[VectorSearchProfile(name="vector-profile-1", algorithm_configuration_name="algorithms-config-1")],
-            algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")]
-        )
-
-        return vector_search, dimensions
-
-    def _infer_field_type(self, value) -> Any:
-        #Map Python types to SearchFieldDataType, including collections
-
-        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-        if value is None:
-            return SearchFieldDataType.String
-
-        t = type(value)
-
-        # Handle list/array types as Collections
-        if t in (list, tuple):
-            # If empty list, default to Collection of Double
-            if not value:
-                return SearchFieldDataType.Collection(SearchFieldDataType.Double)
-            # Get type of first element for non-empty lists
-            element_type = self._infer_field_type(value[0])
-            return SearchFieldDataType.Collection(element_type)
-        # Handle vector embeddings (list or tuple of floats)
-        if type(value) in (list, tuple) and all(isinstance(x, (int, float)) for x in value):
-            return SearchFieldDataType.Collection(SearchFieldDataType.Single)
-
-        # Handle basic types
-        logger.info(f"######## Infer field type for value:[ {value} ] of type [ {t} ]")
-        if t is bool:
-            return SearchFieldDataType.Boolean
-        if t is int:
-            return SearchFieldDataType.Int32
-        if t is float:
-            return SearchFieldDataType.Double
-        print(f"############## Infer field type for value: {value} of type {t}")
-        print(t is str)
-        if t is str:
-            return SearchFieldDataType.String
-        # fallback to string
-        logger.warning(f"Falling back to string type for value: {value} of type {t}")
-        return SearchFieldDataType.String
-
-    def _build_fields(self, sample: Dict[str, Any], key_field: str):
-        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-        fields = []
-        # Add key field
-        if key_field not in sample:
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-        else:
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-        # Add regular fields
-        for k, v in sample.items():
-            logger.info(f"================={k}============")
-            if k == key_field:
-                continue
-            logger.info(f"#### Infer field type for field: {k}")
-            typ = self._infer_field_type(v)
-            logger.info(f"#### Inferred type for field {k}: {typ}")
-            if typ == SearchFieldDataType.String:
-                fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-            else:
-                fields.append(SimpleField(name=k, type=typ))
-
-        # Add vector field if vectorization is enabled
-        if self.config.get("vectorization"):
-            vector_field = self.config.get("vector_field", "contentVector")
-            _, dimensions = self._build_vector_search_config()
-            fields.append(
-                SearchField(
-                    name=vector_field,
-                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
-                    searchable=True,
-                    vector_search_dimensions=dimensions,
-                    vector_search_profile_name="vector-profile-1"
-                )
-            )
-
-        return fields
-
-    def create_index(self, sample: Dict[str, Any]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-            fields = self._build_fields(sample, key_field)
-
-            # Create index with vector search if enabled
-            if self.config.get("vectorization"):
-                vector_search, _ = self._build_vector_search_config()
-                index = SearchIndex(
-                    name=index_name,
-                    fields=fields,
-                    vector_search=vector_search
-                )
-            else:
-                index = SearchIndex(name=index_name, fields=fields)
-
-            index_client.create_or_update_index(index)
-            logger.info(f"Azure Search index '{index_name}' created/updated with vectorization={self.config.get('vectorization', False)}")
-            return True
-        except Exception as ex:
-            logger.exception("AzureSearchIndexer.create_index failed")
-            return False
-
-    def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            # Add IDs if missing
-            from uuid import uuid4
-            for d in docs:
-                if key_field not in d:
-                    d[key_field] = str(uuid4())
-                elif not isinstance(d[key_field], str):
-                    d[key_field] = str(d[key_field])
-
-            # Add vector embeddings if enabled
-            if self.config.get("vectorization"):
-                vector_field = self.config.get("vector_field", "contentVector")
-                content_field = self.config.get("content_field", "content")
-
-                for doc in docs:
-                    if content_field in doc:
-                        try:
-                            embedding = self._get_embeddings(str(doc[content_field]))
-                            doc[vector_field] = embedding
-                        except Exception as e:
-                            logger.error(f"Failed to get embedding for document {doc.get(key_field)}: {str(e)}")
-                            continue
-
-            client = SearchClient(endpoint=endpoint, index_name=index_name,
-                                  credential=AzureKeyCredential(api_key))
-
-            logger.info(f"Uploading {len(docs)} documents to index {index_name}")
-            result = client.upload_documents(documents=docs)
-
-            failed = [r for r in result if not r.succeeded]
-            if failed:
-                logger.error(f"Some documents failed to upload: {failed}")
-                return False
-
-            logger.info("Documents uploaded successfully")
-            return True
-
-        except Exception:
-            logger.exception("AzureSearchIndexer.upload_documents failed")
-            return False
-
-    def index(self, rows: List[Dict[str, Any]]) -> bool:
-        """High level: create index (based on first row) and upload all rows."""
-        if not rows:
-            logger.error("AzureSearchIndexer.index called with empty rows")
-            return False
-
-        try:
-            if not self.validate_config():
-                return False
-
-            sample = rows[0]
-            logger.info(f"Creating/updating index with sample: {sample}")
-
-            ok = self.create_index(sample)
-            if not ok:
-                return False
-
-            ok2 = self.upload_documents(rows)
-            return ok2
-
-        except Exception:
-            logger.exception("AzureSearchIndexer.index failed")
-            return False
{datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/WHEEL: file without changes
{datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/licenses/LICENSE: file without changes
{datasourcelib-0.1.3.dist-info → datasourcelib-0.1.5.dist-info}/top_level.txt: file without changes