datasourcelib 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -110,7 +110,7 @@ class AzureSearchIndexer:
110
110
  logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
111
111
  raise
112
112
 
113
- def _build_vector_search_config(self):
113
+ def _build_vector_search_config_old(self):
114
114
  AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
115
115
  vector_config = self.config.get("vector_config", {})
116
116
  dimensions = vector_config.get("dimensions", 1536)
@@ -121,6 +121,107 @@ class AzureSearchIndexer:
121
121
  )
122
122
 
123
123
  return vector_search, dimensions
124
+
125
def _build_vector_search_config(self):
    """Build the vector-search section of the index definition.

    Reads ``vector_config.dimensions`` (default 1536) together with the
    ``embedding_deployment``, ``embedding_endpoint``, ``embedding_key``,
    ``embedding_model``, ``content_field`` and ``vector_field`` settings
    from ``self.config`` and assembles one HNSW algorithm configuration,
    one profile and one Azure OpenAI vectorizer definition.

    ``vector_config.algorithm`` is not consulted: an
    ``HnswAlgorithmConfiguration`` is always used.

    Returns:
        tuple: ``(vector_search, dimensions)``. ``vector_search`` is an SDK
        ``VectorSearch`` instance when the installed SDK accepts the
        arguments, otherwise a plain dict carrying the same profile,
        algorithm and vectorizer entries.

    Raises:
        RuntimeError: if neither ``embedding_model`` nor
            ``embedding_deployment`` is configured.
    """
    # _ensure_sdk() returns a fixed 16-item tuple; only the three
    # vector-search model classes are needed in this method.
    (
        _, _, _, _, _, _, _, _, _,
        VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration,
        _, _, _, _,
    ) = self._ensure_sdk()

    algorithm_name = "algorithms-config-1"
    profile_name = "vector-profile-1"
    vectorizer_name = "azure-openai-vectorizer"

    # `or {}` also covers an explicit ``"vector_config": None`` entry.
    vector_config = self.config.get("vector_config") or {}
    dimensions = vector_config.get("dimensions", 1536)

    alg_cfg = HnswAlgorithmConfiguration(name=algorithm_name)

    # Vectorizer settings come from the Azure OpenAI entries of the config.
    deployment = self.config.get("embedding_deployment")
    endpoint = self.config.get("embedding_endpoint")
    api_key = self.config.get("embedding_key")
    # modelName is required; prefer an explicit embedding_model and fall
    # back to the deployment name.
    model_name = self.config.get("embedding_model") or deployment
    content_field = self.config.get("content_field", "content")
    vector_field = self.config.get("vector_field", "contentVector")

    if not model_name:
        raise RuntimeError("Vectorizer configuration requires 'embedding_model' or 'embedding_deployment' in vector_db_config")

    azure_openai_parameters = {
        "resourceUri": endpoint.rstrip('/') if endpoint else None,
        "modelName": model_name,
    }
    if deployment:
        azure_openai_parameters["deploymentId"] = deployment
    azure_openai_parameters["apiKey"] = api_key

    # NOTE(review): the "options"/"fieldMapping" section is passed through
    # as written; confirm it is part of the service's vectorizer schema.
    vectorizer = {
        "name": vectorizer_name,
        "kind": "azureOpenAI",
        "azureOpenAIParameters": azure_openai_parameters,
        "options": {
            "fieldMapping": [
                {
                    "sourceContext": f"/document/{content_field}",
                    "outputs": [
                        {
                            "targetContext": f"/document/{vector_field}",
                            "targetDimensions": dimensions,
                        }
                    ],
                }
            ]
        },
    }

    # Plain-dict equivalent used whenever the SDK models cannot be built.
    # NOTE(review): the algorithm entry carries only a name; verify whether
    # the service also expects a "kind" here.
    dict_config = {
        "profiles": [{
            "name": profile_name,
            "algorithmConfigurationName": algorithm_name,
            "vectorizerName": vectorizer_name,
        }],
        "algorithms": [{"name": algorithm_name}],
        "vectorizers": [vectorizer],
    }

    try:
        try:
            profile = VectorSearchProfile(
                name=profile_name,
                algorithm_configuration_name=algorithm_name,
                vectorizer_name=vectorizer_name,
            )
        except TypeError:
            # The installed SDK does not accept vectorizer_name as a
            # constructor argument; attach it afterwards when possible.
            profile = VectorSearchProfile(
                name=profile_name,
                algorithm_configuration_name=algorithm_name,
            )
            try:
                setattr(profile, "vectorizer_name", vectorizer_name)
            except Exception:
                logger.debug(
                    "Could not attach vectorizer_name to VectorSearchProfile",
                    exc_info=True,
                )

        vector_search = VectorSearch(
            profiles=[profile],
            algorithms=[alg_cfg],
            vectorizers=[vectorizer],
        )
    except Exception:
        # Best-effort: keep going with the dict form, but leave a trace so
        # the downgrade is visible (the vectorizer dict holds the API key,
        # so it is deliberately not logged).
        logger.warning(
            "SDK vector search models could not be constructed; "
            "falling back to dict configuration",
            exc_info=True,
        )
        vector_search = dict_config

    logger.info("Built vector_search config (dimensions=%s, model=%s, vectorizer=%s)",
                dimensions, model_name, vectorizer_name)
    return vector_search, dimensions
224
+
124
225
 
125
226
  def _build_semantic_settings(self):
126
227
  """
@@ -1,6 +1,6 @@
1
1
  from datasourcelib.core.sync_base import SyncBase
2
2
  from datasourcelib.utils.logger import get_logger
3
- from datasourcelib.indexes.azure_search_index_vector import AzureSearchIndexer
3
+ from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
4
4
  logger = get_logger(__name__)
5
5
 
6
6
  class FullLoadStrategy(SyncBase):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasourcelib
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Data source sync strategies for vector DBs
5
5
  Home-page: https://github.com/jaiprakash0217/datasourcelib
6
6
  Author: Jai Prakash
@@ -12,12 +12,10 @@ datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMr
12
12
  datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
13
13
  datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
14
14
  datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
15
- datasourcelib/indexes/azure_search_index.py,sha256=o3BoSxURBk5jCC3AlNz-v9_igg-dXYS4yUxXZwSfqFg,17265
16
- datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
17
- datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
15
+ datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
18
16
  datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
19
17
  datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
20
- datasourcelib/strategies/full_load.py,sha256=zqDZZcmyJKXQ4v3coq5njjadlBNI9V8f_lfXVZCoLbQ,1698
18
+ datasourcelib/strategies/full_load.py,sha256=U1a9wO_ZLRnMInvU0IRW-ZKnhu0Cv437VcNMKIYuzMA,1691
21
19
  datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
22
20
  datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
23
21
  datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900
@@ -27,8 +25,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
27
25
  datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
28
26
  datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
29
27
  datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
30
- datasourcelib-0.1.3.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
31
- datasourcelib-0.1.3.dist-info/METADATA,sha256=cPVrPEkPN22sTYOoO20byXcpu5hvKVQIPu3elgyyEko,1185
32
- datasourcelib-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
- datasourcelib-0.1.3.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
34
- datasourcelib-0.1.3.dist-info/RECORD,,
28
+ datasourcelib-0.1.4.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
29
+ datasourcelib-0.1.4.dist-info/METADATA,sha256=LR3db7O_rnbTmF_owLl-lH06xAfP-iZu4aXPtmjVtRo,1185
30
+ datasourcelib-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
+ datasourcelib-0.1.4.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
32
+ datasourcelib-0.1.4.dist-info/RECORD,,
@@ -1,162 +0,0 @@
1
- from typing import List, Dict, Any, Optional
2
- from datasourcelib.utils.logger import get_logger
3
-
4
- logger = get_logger(__name__)
5
-
6
class AzureSearchIndexer:
    """
    Minimal Azure Cognitive Search indexer wrapper.

    Expects vector_db_config with:
      - aisearch_endpoint: str
      - aisearch_index_name: str
      - aisearch_api_key: str
    Optional:
      - key_field: name of unique key in documents (default 'id')
    """

    # Inclusive value range of Edm.Int32; integers outside it need Edm.Int64.
    _INT32_MIN = -(2 ** 31)
    _INT32_MAX = 2 ** 31 - 1

    def __init__(self, vector_db_config: Dict[str, Any]):
        """Store the configuration dict (``None`` becomes an empty dict)."""
        self.config = vector_db_config or {}
        self._client = None
        self._index_client = None

    def validate_config(self) -> bool:
        """Return True when all required connection settings are present.

        Missing keys are logged at ERROR level and the method returns False.
        """
        required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
        missing = [k for k in required if k not in self.config]
        if missing:
            logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
            return False
        return True

    def _ensure_sdk(self):
        """Import the Azure SDK lazily and return the classes used here.

        Returns:
            tuple: ``(AzureKeyCredential, SearchClient, SearchIndexClient,
            SearchIndex, SimpleField, SearchableField, SearchFieldDataType)``.

        Raises:
            RuntimeError: if any of the imports fails.
        """
        try:
            from azure.core.credentials import AzureKeyCredential  # type: ignore
            from azure.search.documents import SearchClient  # type: ignore
            from azure.search.documents.indexes import SearchIndexClient  # type: ignore
            from azure.search.documents.indexes.models import (
                SearchIndex,
                SimpleField,
                SearchableField,
                SearchFieldDataType,
            )  # type: ignore
        except Exception as e:
            raise RuntimeError("azure-search-documents package is required: install azure-search-documents") from e

        return AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType

    @classmethod
    def _fits_int32(cls, value: int) -> bool:
        """Return True when *value* lies within the Edm.Int32 range."""
        return cls._INT32_MIN <= value <= cls._INT32_MAX

    def _infer_field_type(self, value) -> Any:
        """
        Map a Python value to a SearchFieldDataType.

        Exact ``bool``, ``int`` and ``float`` values map to Boolean,
        Int32 (Int64 when the value does not fit in 32 bits) and Double;
        everything else, including ``None`` and ``str``, maps to String.
        """
        *_, SearchFieldDataType = self._ensure_sdk()
        kind = type(value)
        if kind is bool:
            return SearchFieldDataType.Boolean
        if kind is int:
            # An Int32 field cannot hold wider values, so widen the field
            # only when the sample value actually requires it.
            if self._fits_int32(value):
                return SearchFieldDataType.Int32
            return SearchFieldDataType.Int64
        if kind is float:
            return SearchFieldDataType.Double
        # None, str and any unrecognised type are stored as strings.
        return SearchFieldDataType.String

    def _build_fields(self, sample: Dict[str, Any], key_field: str):
        """Derive the index field definitions from one sample document.

        The key field is always declared as a string key, whether or not the
        sample contains it (``upload_documents`` fills in and stringifies
        keys). String values become searchable fields; all other inferred
        types become simple fields.
        """
        _, _, _, _, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()

        fields = [SimpleField(name=key_field, type=SearchFieldDataType.String, key=True)]
        for name, value in sample.items():
            if name == key_field:
                continue
            field_type = self._infer_field_type(value)
            logger.debug("Inferred type %s for field %s", field_type, name)
            # Strings use SearchableField so full-text queries work.
            if field_type == SearchFieldDataType.String:
                fields.append(SearchableField(name=name, type=SearchFieldDataType.String))
            else:
                fields.append(SimpleField(name=name, type=field_type))
        return fields

    def create_index(self, sample: Dict[str, Any]) -> bool:
        """Create or update the configured index from a sample document.

        Returns True on success; any exception is logged and reported as
        False.
        """
        try:
            AzureKeyCredential, _, SearchIndexClient, SearchIndex, _, _, _ = self._ensure_sdk()
            endpoint = self.config["aisearch_endpoint"]
            api_key = self.config["aisearch_api_key"]
            index_name = self.config["aisearch_index_name"]
            key_field = self.config.get("key_field", "id")

            index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
            fields = self._build_fields(sample, key_field)
            index = SearchIndex(name=index_name, fields=fields)
            index_client.create_or_update_index(index)
            logger.info("Azure Search index '%s' created/updated", index_name)
            return True
        except Exception:
            logger.exception("AzureSearchIndexer.create_index failed")
            return False

    def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
        """Upload *docs* to the configured index.

        The dicts in *docs* are modified in place: a missing key field is
        filled with a random UUID string, and ``bool``/``int``/``float`` keys
        are converted to ``str``.

        Returns True only if every document is reported as succeeded; any
        exception is logged and reported as False.
        """
        try:
            AzureKeyCredential, SearchClient, _, _, _, _, _ = self._ensure_sdk()
            endpoint = self.config["aisearch_endpoint"]
            api_key = self.config["aisearch_api_key"]
            index_name = self.config["aisearch_index_name"]
            key_field = self.config.get("key_field", "id")

            from uuid import uuid4
            for doc in docs:
                if key_field not in doc:
                    doc[key_field] = str(uuid4())
                elif type(doc[key_field]) in (bool, int, float):
                    # Exact-type check mirrors _infer_field_type: only values
                    # that would not be inferred as String are converted.
                    doc[key_field] = str(doc[key_field])

            client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
            logger.info("Uploading %d documents to index %s", len(docs), index_name)
            result = client.upload_documents(documents=docs)
            failed = [r for r in result if not r.succeeded]
            if failed:
                logger.error("Some documents failed to upload: %s", failed)
                return False
            logger.info("Uploaded documents successfully")
            return True
        except Exception:
            logger.exception("AzureSearchIndexer.upload_documents failed")
            return False

    def index(self, rows: List[Dict[str, Any]]) -> bool:
        """
        High level: create index (based on first row) and upload all rows.

        Returns False for empty input, invalid configuration, or when either
        step fails.
        """
        if not rows:
            logger.error("AzureSearchIndexer.index called with empty rows")
            return False
        try:
            if not self.validate_config():
                return False
            sample = rows[0]
            # Log field names only; row values may be sensitive.
            logger.info("Creating/updating index from sample row with fields: %s", list(sample))
            if not self.create_index(sample):
                return False
            return self.upload_documents(rows)
        except Exception:
            logger.exception("AzureSearchIndexer.index failed")
            return False
@@ -1,286 +0,0 @@
1
- from typing import List, Dict, Any, Optional
2
- from datasourcelib.utils.logger import get_logger
3
-
4
- logger = get_logger(__name__)
5
-
6
class AzureSearchIndexer:
    """
    Azure Cognitive Search indexer with vector search support.

    Required vector_db_config:
      - aisearch_endpoint: str
      - aisearch_index_name: str
      - aisearch_api_key: str

    Optional vector search config:
      - vectorization: bool (enable vector search)
      - vector_config: dict
          - dimensions: int (default 1536)
          - algorithm / metric: not applied by this class; an HNSW
            configuration with SDK defaults is always used
      - key_field: str (default 'id')
      - content_field: str (default 'content'), the text that gets embedded
      - vector_field: str (default 'contentVector')

    Required when vectorization is enabled:
      - embedding_endpoint: str (Azure OpenAI endpoint for embeddings)
      - embedding_key: str (Azure OpenAI API key)
      - embedding_deployment: str (Azure OpenAI model deployment name)
      - embedding_api_version: str (Azure OpenAI API version)
    """

    # Inclusive value range of Edm.Int32; integers outside it need Edm.Int64.
    _INT32_MIN = -(2 ** 31)
    _INT32_MAX = 2 ** 31 - 1

    def __init__(self, vector_db_config: Dict[str, Any]):
        """Store the configuration dict (``None`` becomes an empty dict)."""
        self.config = vector_db_config or {}
        self._client = None
        self._index_client = None
        self._embedding_client = None

    def _missing_config_keys(self) -> List[str]:
        """Return the required config keys that are absent from self.config."""
        required = ["aisearch_endpoint", "aisearch_index_name", "aisearch_api_key"]
        if self.config.get("vectorization", False):
            # embedding_api_version is read with [] when the embedding client
            # is created, so it is required along with the other settings.
            required += [
                "embedding_endpoint",
                "embedding_key",
                "embedding_deployment",
                "embedding_api_version",
            ]
        return [k for k in required if k not in self.config]

    def validate_config(self) -> bool:
        """Return True when all required settings are present.

        Missing keys are logged at ERROR level and the method returns False.
        """
        missing = self._missing_config_keys()
        if missing:
            logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
            return False
        return True

    def _ensure_sdk(self):
        """Import the Azure Search and OpenAI SDKs lazily.

        Returns:
            tuple: ``(AzureKeyCredential, SearchClient, SearchIndexClient,
            AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType,
            SimpleField, SearchableField, VectorSearch, VectorSearchProfile,
            HnswAlgorithmConfiguration)``.

        Raises:
            RuntimeError: if any of the imports fails.
        """
        try:
            from azure.core.credentials import AzureKeyCredential  # type: ignore
            from azure.search.documents import SearchClient  # type: ignore
            from azure.search.documents.indexes import SearchIndexClient  # type: ignore
            from openai import AzureOpenAI  # type: ignore
            from azure.search.documents.indexes.models import (
                SearchIndex,
                SearchField,
                SearchFieldDataType,
                SimpleField,
                SearchableField,
                VectorSearch,
                VectorSearchProfile,
                HnswAlgorithmConfiguration
            )  # type: ignore

        except Exception as e:
            raise RuntimeError("Required packages missing. Install: azure-search-documents openai") from e

        return (
            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration
        )

    def _setup_embedding_client(self):
        """Create the Azure OpenAI client once, if vectorization is enabled."""
        if self._embedding_client or not self.config.get("vectorization"):
            return
        try:
            _, _, _, AzureOpenAI, *_ = self._ensure_sdk()
            self._embedding_client = AzureOpenAI(
                api_version=self.config["embedding_api_version"],
                azure_endpoint=self.config["embedding_endpoint"],
                api_key=self.config["embedding_key"],
            )
            logger.info("Azure OpenAI embedding client initialized")
        except Exception:
            logger.exception("Failed to initialize embedding client")
            raise

    def _get_embeddings(self, text: str) -> List[float]:
        """Return the embedding of *text* from the configured deployment.

        Any failure is logged (with the first 100 characters of the text)
        and re-raised.
        """
        try:
            self._setup_embedding_client()
            response = self._embedding_client.embeddings.create(
                model=self.config["embedding_deployment"],
                input=text
            )
            return response.data[0].embedding
        except Exception:
            logger.exception("Failed to get embeddings for text: %s...", text[:100])
            raise

    def _build_vector_search_config(self):
        """Return ``(VectorSearch, dimensions)`` for the index definition.

        Uses one HNSW algorithm configuration and one profile with fixed
        names; ``dimensions`` comes from ``vector_config`` (default 1536).
        """
        *_, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
        # `or {}` also covers an explicit ``"vector_config": None`` entry.
        vector_config = self.config.get("vector_config") or {}
        dimensions = vector_config.get("dimensions", 1536)

        vector_search = VectorSearch(
            profiles=[VectorSearchProfile(name="vector-profile-1", algorithm_configuration_name="algorithms-config-1")],
            algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")]
        )

        return vector_search, dimensions

    @classmethod
    def _fits_int32(cls, value: int) -> bool:
        """Return True when *value* lies within the Edm.Int32 range."""
        return cls._INT32_MIN <= value <= cls._INT32_MAX

    def _infer_field_type(self, value) -> Any:
        """Map a Python value to a SearchFieldDataType, including collections.

        Lists and tuples become a Collection of the type inferred from their
        first element (Collection of Double when empty). Exact ``bool``,
        ``int`` and ``float`` values map to Boolean, Int32 (Int64 when the
        value does not fit in 32 bits) and Double. ``None``, ``str`` and any
        other type map to String.
        """
        _, _, _, _, _, _, SearchFieldDataType, *_ = self._ensure_sdk()

        if value is None:
            return SearchFieldDataType.String

        kind = type(value)

        if kind in (list, tuple):
            if not value:
                return SearchFieldDataType.Collection(SearchFieldDataType.Double)
            return SearchFieldDataType.Collection(self._infer_field_type(value[0]))

        logger.debug("Inferring field type for value of type %s", kind)
        if kind is bool:
            return SearchFieldDataType.Boolean
        if kind is int:
            # An Int32 field cannot hold wider values, so widen the field
            # only when the sample value actually requires it.
            if self._fits_int32(value):
                return SearchFieldDataType.Int32
            return SearchFieldDataType.Int64
        if kind is float:
            return SearchFieldDataType.Double
        if kind is not str:
            logger.warning("Falling back to string type for value: %s of type %s", value, kind)
        return SearchFieldDataType.String

    def _build_fields(self, sample: Dict[str, Any], key_field: str):
        """Derive the index field definitions from one sample document.

        The key field is always declared as a string key. String values
        become searchable fields; other inferred types become simple fields.
        With vectorization enabled, a vector field is appended, and a sample
        entry of the same name is skipped so the field is declared only once.
        """
        _, _, _, _, _, SearchField, SearchFieldDataType, SimpleField, SearchableField, *_ = self._ensure_sdk()

        vectorize = bool(self.config.get("vectorization"))
        vector_field = self.config.get("vector_field", "contentVector")

        fields = [SimpleField(name=key_field, type=SearchFieldDataType.String, key=True)]

        for name, value in sample.items():
            if name == key_field:
                continue
            if vectorize and name == vector_field:
                # Rows that already went through upload_documents carry the
                # embedding; the dedicated vector field below covers it.
                continue
            field_type = self._infer_field_type(value)
            logger.debug("Inferred type %s for field %s", field_type, name)
            if field_type == SearchFieldDataType.String:
                fields.append(SearchableField(name=name, type=SearchFieldDataType.String))
            else:
                fields.append(SimpleField(name=name, type=field_type))

        if vectorize:
            _, dimensions = self._build_vector_search_config()
            fields.append(
                SearchField(
                    name=vector_field,
                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True,
                    vector_search_dimensions=dimensions,
                    vector_search_profile_name="vector-profile-1"
                )
            )

        return fields

    def create_index(self, sample: Dict[str, Any]) -> bool:
        """Create or update the configured index from a sample document.

        The vector search configuration is attached when vectorization is
        enabled. Returns True on success; any exception is logged and
        reported as False.
        """
        try:
            AzureKeyCredential, _, SearchIndexClient, _, SearchIndex, *_ = self._ensure_sdk()

            endpoint = self.config["aisearch_endpoint"]
            api_key = self.config["aisearch_api_key"]
            index_name = self.config["aisearch_index_name"]
            key_field = self.config.get("key_field", "id")

            index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
            fields = self._build_fields(sample, key_field)

            if self.config.get("vectorization"):
                vector_search, _ = self._build_vector_search_config()
                index = SearchIndex(
                    name=index_name,
                    fields=fields,
                    vector_search=vector_search
                )
            else:
                index = SearchIndex(name=index_name, fields=fields)

            index_client.create_or_update_index(index)
            logger.info(
                "Azure Search index '%s' created/updated with vectorization=%s",
                index_name,
                self.config.get("vectorization", False),
            )
            return True
        except Exception:
            logger.exception("AzureSearchIndexer.create_index failed")
            return False

    def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
        """Upload *docs* to the configured index, embedding them if enabled.

        The dicts in *docs* are modified in place: a missing key is filled
        with a random UUID string, non-string keys are converted to ``str``,
        and with vectorization enabled each document that has the content
        field receives its embedding under the vector field. A failed
        embedding is logged and that document is uploaded without a vector.

        Returns True only if every document is reported as succeeded; any
        exception is logged and reported as False.
        """
        try:
            AzureKeyCredential, SearchClient, *_ = self._ensure_sdk()
            endpoint = self.config["aisearch_endpoint"]
            api_key = self.config["aisearch_api_key"]
            index_name = self.config["aisearch_index_name"]
            key_field = self.config.get("key_field", "id")

            from uuid import uuid4
            for doc in docs:
                if key_field not in doc:
                    doc[key_field] = str(uuid4())
                elif not isinstance(doc[key_field], str):
                    doc[key_field] = str(doc[key_field])

            if self.config.get("vectorization"):
                vector_field = self.config.get("vector_field", "contentVector")
                content_field = self.config.get("content_field", "content")

                for doc in docs:
                    if content_field not in doc:
                        continue
                    try:
                        doc[vector_field] = self._get_embeddings(str(doc[content_field]))
                    except Exception as e:
                        # Best-effort: keep uploading the remaining documents.
                        logger.error(f"Failed to get embedding for document {doc.get(key_field)}: {str(e)}")

            client = SearchClient(endpoint=endpoint, index_name=index_name,
                                  credential=AzureKeyCredential(api_key))

            logger.info("Uploading %d documents to index %s", len(docs), index_name)
            result = client.upload_documents(documents=docs)

            failed = [r for r in result if not r.succeeded]
            if failed:
                logger.error("Some documents failed to upload: %s", failed)
                return False

            logger.info("Documents uploaded successfully")
            return True

        except Exception:
            logger.exception("AzureSearchIndexer.upload_documents failed")
            return False

    def index(self, rows: List[Dict[str, Any]]) -> bool:
        """High level: create index (based on first row) and upload all rows.

        Returns False for empty input, invalid configuration, or when either
        step fails.
        """
        if not rows:
            logger.error("AzureSearchIndexer.index called with empty rows")
            return False

        try:
            if not self.validate_config():
                return False

            sample = rows[0]
            # Log field names only; row values may be sensitive.
            logger.info("Creating/updating index from sample row with fields: %s", list(sample))

            if not self.create_index(sample):
                return False

            return self.upload_documents(rows)

        except Exception:
            logger.exception("AzureSearchIndexer.index failed")
            return False