datasourcelib 0.1.2-py3-none-any.whl → 0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -110,7 +110,7 @@ class AzureSearchIndexer:
110
110
  logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
111
111
  raise
112
112
 
113
- def _build_vector_search_config(self):
113
+ def _build_vector_search_config_old(self):
114
114
  AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
115
115
  vector_config = self.config.get("vector_config", {})
116
116
  dimensions = vector_config.get("dimensions", 1536)
@@ -121,6 +121,107 @@ class AzureSearchIndexer:
121
121
  )
122
122
 
123
123
  return vector_search, dimensions
124
+
125
+ def _build_vector_search_config(self):
126
+ AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
127
+
128
+ vector_config = self.config.get("vector_config", {})
129
+ dimensions = vector_config.get("dimensions", 1536)
130
+ algorithm = vector_config.get("algorithm", "hnsw").lower()
131
+
132
+ # Build algorithm configuration (SDK model if available)
133
+ alg_cfg = HnswAlgorithmConfiguration(name="algorithms-config-1")
134
+
135
+ # Build vectorizer settings using Azure OpenAI config from vector_db_config
136
+ deployment = self.config.get("embedding_deployment")
137
+ endpoint = self.config.get("embedding_endpoint")
138
+ api_key = self.config.get("embedding_key")
139
+ # modelName required for API version 2025-09-01 — prefer explicit embedding_model, fall back to deployment
140
+ model_name = self.config.get("embedding_model") or deployment
141
+ content_field = self.config.get("content_field", "content")
142
+ vector_field = self.config.get("vector_field", "contentVector")
143
+
144
+ if not model_name:
145
+ raise RuntimeError("Vectorizer configuration requires 'embedding_model' or 'embedding_deployment' in vector_db_config")
146
+
147
+ # Define vectorizer with explicit name and required azureOpenAIParameters including modelName
148
+ vectorizer_name = "azure-openai-vectorizer"
149
+ vectorizer = {
150
+ "name": vectorizer_name,
151
+ "kind": "azureOpenAI",
152
+ "azureOpenAIParameters": {
153
+ "resourceUri": endpoint.rstrip('/') if endpoint else None,
154
+ # include both modelName (required) and deploymentId (if provided)
155
+ "modelName": model_name,
156
+ **({"deploymentId": deployment} if deployment else {}),
157
+ "apiKey": api_key
158
+ },
159
+ "options": {
160
+ "fieldMapping": [
161
+ {
162
+ "sourceContext": f"/document/{content_field}",
163
+ "outputs": [
164
+ {
165
+ "targetContext": f"/document/{vector_field}",
166
+ "targetDimensions": dimensions
167
+ }
168
+ ]
169
+ }
170
+ ]
171
+ }
172
+ }
173
+
174
+ profile_name = "vector-profile-1"
175
+ try:
176
+ # Create profile with vectorizer reference (SDK may expect vectorizer_name or vectorizer depending on version)
177
+ try:
178
+ profile = VectorSearchProfile(
179
+ name=profile_name,
180
+ algorithm_configuration_name="algorithms-config-1",
181
+ vectorizer_name=vectorizer_name
182
+ )
183
+ except TypeError:
184
+ # fallback if SDK constructor uses different parameter names
185
+ profile = VectorSearchProfile(name=profile_name, algorithm_configuration_name="algorithms-config-1")
186
+ try:
187
+ setattr(profile, "vectorizer_name", vectorizer_name)
188
+ except Exception:
189
+ pass
190
+
191
+ try:
192
+ # Construct full vector search config with both profile and vectorizer
193
+ vector_search = VectorSearch(
194
+ profiles=[profile],
195
+ algorithms=[alg_cfg],
196
+ vectorizers=[vectorizer]
197
+ )
198
+ except Exception:
199
+ # Fallback to dict if SDK constructor differs
200
+ vector_search = {
201
+ "profiles": [{
202
+ "name": profile_name,
203
+ "algorithmConfigurationName": "algorithms-config-1",
204
+ "vectorizerName": vectorizer_name
205
+ }],
206
+ "algorithms": [{"name": "algorithms-config-1"}],
207
+ "vectorizers": [vectorizer]
208
+ }
209
+ except Exception:
210
+ # Full dict fallback
211
+ vector_search = {
212
+ "profiles": [{
213
+ "name": profile_name,
214
+ "algorithmConfigurationName": "algorithms-config-1",
215
+ "vectorizerName": vectorizer_name
216
+ }],
217
+ "algorithms": [{"name": "algorithms-config-1"}],
218
+ "vectorizers": [vectorizer]
219
+ }
220
+
221
+ logger.info("Built vector_search config (dimensions=%s, model=%s, vectorizer=%s)",
222
+ dimensions, model_name, vectorizer_name)
223
+ return vector_search, dimensions
224
+
124
225
 
125
226
  def _build_semantic_settings(self):
126
227
  """
@@ -0,0 +1,14 @@
1
+ from .daily_load import DailyLoadStrategy
2
+ from .full_load import FullLoadStrategy
3
+ from .incremental_load import IncrementalLoadStrategy
4
+ from .ondemand_load import OnDemandLoadStrategy
5
+ from .timerange_load import TimeRangeLoadStrategy
6
+
7
+
8
+ __all__ = [
9
+ "DailyLoadStrategy",
10
+ "FullLoadStrategy",
11
+ "IncrementalLoadStrategy",
12
+ "OnDemandLoadStrategy",
13
+ "TimeRangeLoadStrategy"
14
+ ]
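All five strategies are exported from a single namespace. A hedged sketch of selecting one by name follows; the registry dict is illustrative only and is not part of the package, and constructor arguments depend on SyncBase, which is outside this diff.

from datasourcelib.strategies import (
    DailyLoadStrategy, FullLoadStrategy, IncrementalLoadStrategy,
    OnDemandLoadStrategy, TimeRangeLoadStrategy,
)

STRATEGIES = {                      # illustrative registry, not shipped with the package
    "daily": DailyLoadStrategy,
    "full": FullLoadStrategy,
    "incremental": IncrementalLoadStrategy,
    "ondemand": OnDemandLoadStrategy,
    "timerange": TimeRangeLoadStrategy,
}

strategy_cls = STRATEGIES["full"]   # instantiation depends on SyncBase, not shown here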
@@ -0,0 +1,22 @@
1
+ from datasourcelib.core.sync_base import SyncBase
2
+ from datasourcelib.utils.logger import get_logger
3
+ from datetime import datetime, timedelta
4
+
5
+ logger = get_logger(__name__)
6
+
7
+ class DailyLoadStrategy(SyncBase):
8
+ """Daily scheduled load (wraps incremental)."""
9
+
10
+ def validate(self) -> bool:
11
+ return True
12
+
13
+ def sync(self, run_date: str = None, **kwargs) -> bool:
14
+ try:
15
+ run_date = run_date or datetime.utcnow().date().isoformat()
16
+ logger.info("Starting daily load for %s", run_date)
17
+ # Typically call incremental with last_sync = previous day midnight
18
+ # TODO implement scheduling integration externally; the strategy here is idempotent
19
+ return True
20
+ except Exception:
21
+ logger.exception("DailyLoadStrategy.sync failed")
22
+ return False
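The comment above suggests calling the incremental load with last_sync set to the previous day's midnight. A minimal stdlib sketch of that computation, assuming UTC; the helper name is hypothetical and not part of the package.

from datetime import datetime, timedelta, timezone
from typing import Optional

def previous_midnight_utc(now: Optional[datetime] = None) -> str:
    # Previous day at 00:00:00 UTC, as an ISO-8601 string suitable for last_sync
    now = now or datetime.now(timezone.utc)
    midnight = (now - timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
    return midnight.isoformat()

# e.g. an incremental-style call: sync(last_sync=previous_midnight_utc())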
@@ -0,0 +1,38 @@
1
+ from datasourcelib.core.sync_base import SyncBase
2
+ from datasourcelib.utils.logger import get_logger
3
+ from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
4
+ logger = get_logger(__name__)
5
+
6
+ class FullLoadStrategy(SyncBase):
7
+ """Full load: replace or reload entire source into vector DB."""
8
+
9
+ def validate(self) -> bool:
10
+ # Minimal validation: required keys exist
11
+ dsok = self.data_source.validate_config()
12
+ return dsok
13
+
14
+ def sync(self, **kwargs) -> bool:
15
+ try:
16
+ logger.info("Running full data load")
17
+ data = self.data_source.fetch_data(**kwargs)
18
+ for key, value in kwargs.items():
19
+ print(f"{key} = {value}")
20
+ # Implement real extract -> transform -> load to vector DB
21
+ # Example pseudocode:
22
+ # vector_client.upsert_batch(self.vector_db_config, rows)
23
+ # New: use AzureSearchIndexer to create index and upload documents if requested
24
+ if isinstance(data, list) and data:
25
+ indexer = AzureSearchIndexer(self.vector_db_config or {})
26
+ if not indexer.validate_config():
27
+ logger.error("Vector DB config invalid for Azure Search indexer")
28
+ return False
29
+ ok = indexer.index(data)
30
+ if not ok:
31
+ logger.error("Indexing data to Azure Search failed")
32
+ return False
33
+
34
+ logger.info("Full data load finished successfully")
35
+ return True
36
+ except Exception:
37
+ logger.exception("FullLoadStrategy.sync failed")
38
+ return False
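For reference, the hand-off that sync performs once data is a non-empty list, shown standalone. The rows and vector_db_config below are placeholders (config as sketched earlier in this diff), and the Azure Search SDK must be installed for index() to succeed.

from datasourcelib.indexes.azure_search_index import AzureSearchIndexer

rows = [{"id": "1", "content": "hello world"}]     # placeholder documents
indexer = AzureSearchIndexer(vector_db_config)     # placeholder config dict
if indexer.validate_config():
    ok = indexer.index(rows)                       # builds the index from rows[0], then uploads all rows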
@@ -0,0 +1,27 @@
1
+ from datetime import datetime
2
+ from datasourcelib.core.sync_base import SyncBase
3
+ from datasourcelib.utils.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+ class IncrementalLoadStrategy(SyncBase):
8
+ """Incremental load using last_sync timestamp or cursor."""
9
+
10
+ def validate(self) -> bool:
11
+ # require source to support incremental field or cursor
12
+ if "cursor_field" not in self.source_config and "last_sync" not in self.source_config:
13
+ logger.error("IncrementalLoadStrategy missing cursor_field or last_sync in source_config")
14
+ return False
15
+ return True
16
+
17
+ def sync(self, last_sync: str = None, **kwargs) -> bool:
18
+ try:
19
+ last = last_sync or self.source_config.get("last_sync")
20
+ logger.info("Running incremental load since %s", last)
21
+ # TODO: fetch delta rows since 'last' and upsert to vector DB
22
+ # After successful run store new last_sync timestamp
23
+ logger.info("Incremental load completed")
24
+ return True
25
+ except Exception:
26
+ logger.exception("IncrementalLoadStrategy.sync failed")
27
+ return False
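validate() expects either cursor_field or last_sync in source_config, and the TODO leaves checkpoint persistence to the caller. A minimal sketch of that checkpoint update, assuming an in-memory config dict with placeholder values; real code would persist the new timestamp.

from datetime import datetime, timezone

source_config = {"cursor_field": "updated_at",
                 "last_sync": "2024-01-01T00:00:00+00:00"}   # placeholder values

# ... fetch rows where updated_at > source_config["last_sync"] and upsert them ...

# after a successful run, advance the checkpoint
source_config["last_sync"] = datetime.now(timezone.utc).isoformat()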
@@ -0,0 +1,19 @@
1
+ from datasourcelib.core.sync_base import SyncBase
2
+ from datasourcelib.utils.logger import get_logger
3
+
4
+ logger = get_logger(__name__)
5
+
6
+ class OnDemandLoadStrategy(SyncBase):
7
+ """On demand load triggered by user request (arbitrary params)."""
8
+
9
+ def validate(self) -> bool:
10
+ return True
11
+
12
+ def sync(self, **kwargs) -> bool:
13
+ try:
14
+ logger.info("On-demand sync invoked with params: %s", kwargs)
15
+ # Use kwargs to drive partial loads, filters, ids etc.
16
+ return True
17
+ except Exception:
18
+ logger.exception("OnDemandLoadStrategy.sync failed")
19
+ return False
@@ -0,0 +1,24 @@
1
+ from datetime import datetime
2
+ from datasourcelib.core.sync_base import SyncBase
3
+ from datasourcelib.utils.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+ class TimeRangeLoadStrategy(SyncBase):
8
+ """Load records between a start and end timestamp."""
9
+
10
+ def validate(self) -> bool:
11
+ # rely on params at runtime; minimal validation OK
12
+ return True
13
+
14
+ def sync(self, start: str = None, end: str = None, **kwargs) -> bool:
15
+ try:
16
+ if not start or not end:
17
+ logger.error("TimeRangeLoadStrategy requires 'start' and 'end'")
18
+ return False
19
+ logger.info("Time range load between %s and %s", start, end)
20
+ # TODO: query source for timeframe and upsert
21
+ return True
22
+ except Exception:
23
+ logger.exception("TimeRangeLoadStrategy.sync failed")
24
+ return False
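A small sketch of how a caller might produce the required start/end bounds; strategy construction is omitted because it depends on SyncBase, which is outside this diff.

from datetime import datetime, timedelta, timezone

end = datetime.now(timezone.utc)
start = end - timedelta(hours=6)
# strategy.sync(start=start.isoformat(), end=end.isoformat())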
@@ -0,0 +1,12 @@
1
+ from .byte_reader import ByteReader
2
+ from .exceptions import DatasourceLibError, SyncStrategyNotFound, DataSourceNotFound
3
+ from .file_reader import FileReader
4
+
5
+
6
+ __all__ = [
7
+ "ByteReader",
8
+ "FileReader",
9
+ "DatasourceLibError",
10
+ "SyncStrategyNotFound",
11
+ "SourceNotFound"
12
+ ]
@@ -0,0 +1,256 @@
1
+ from pathlib import Path
2
+ from typing import Optional, Union, List
3
+ import io
4
+ import pandas as pd
5
+
6
+ # --- Optional helpers ---
7
+ from charset_normalizer import from_bytes as cn_from_bytes
8
+
9
+ # DOCX
10
+ from docx import Document as DocxDocument
11
+
12
+ # PDF
13
+ import fitz # pymupdf
14
+ import pdfplumber
15
+
16
+ # PPTX
17
+ from pptx import Presentation
18
+
19
+ # YAML / XML
20
+ import yaml
21
+ from lxml import etree
22
+ import json
23
+
24
+
25
+ class ByteReader:
26
+ """
27
+ Unified reader for common file types.
28
+ - read_text(path): file path -> text
29
+ - read_table(path): file path -> DataFrame
30
+ - read_text_from_bytes(data, ext): bytes -> text
31
+ - read_table_from_bytes(data, ext): bytes -> DataFrame
32
+ """
33
+
34
+ TEXT_EXTS = {".txt", ".log", ".md"}
35
+ TABLE_EXTS = {".csv", ".tsv", ".xlsx", ".xls"}
36
+ DOCX_EXTS = {".docx"}
37
+ PDF_EXTS = {".pdf"}
38
+ PPTX_EXTS = {".pptx"}
39
+ JSON_EXTS = {".json"}
40
+ YAML_EXTS = {".yaml", ".yml"}
41
+ INI_EXTS = {".ini", ".cfg"}
42
+ XML_EXTS = {".xml"}
43
+
44
+ def __init__(self, default_encoding: str = "utf-8", errors: str = "replace"):
45
+ self.default_encoding = default_encoding
46
+ self.errors = errors
47
+
48
+ # -----------------------
49
+ # Public API (paths)
50
+ # -----------------------
51
+ def read_text(self, path: Union[str, Path]) -> str:
52
+ path = Path(path)
53
+ ext = path.suffix.lower()
54
+
55
+ if ext in self.TEXT_EXTS:
56
+ return path.read_text(encoding=self.default_encoding, errors=self.errors)
57
+
58
+ if ext in self.PDF_EXTS:
59
+ return self._read_pdf_text_path(path)
60
+
61
+ if ext in self.DOCX_EXTS:
62
+ with path.open("rb") as f:
+     return self._read_docx_text_fp(f)
63
+
64
+ if ext in self.PPTX_EXTS:
65
+ with path.open("rb") as f:
+     return self._read_pptx_text_fp(f)
66
+
67
+ if ext in self.JSON_EXTS:
68
+ with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
69
+ obj = json.load(f)
70
+ return json.dumps(obj, indent=2, ensure_ascii=False)
71
+
72
+ if ext in self.YAML_EXTS:
73
+ with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
74
+ obj = yaml.safe_load(f)
75
+ return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
76
+
77
+ if ext in self.INI_EXTS:
78
+ import configparser
79
+ parser = configparser.ConfigParser()
80
+ with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
81
+ parser.read_file(f)
82
+ output = io.StringIO()
83
+ parser.write(output)
84
+ return output.getvalue()
85
+
86
+ if ext in self.XML_EXTS:
87
+ tree = etree.parse(str(path))
88
+ return etree.tostring(tree, pretty_print=True, encoding="unicode")
89
+
90
+ if ext in self.TABLE_EXTS:
91
+ df = self.read_table(path)
92
+ return df.to_csv(index=False)
93
+
94
+ raise ValueError(f"Unsupported file extension for text extraction: {ext}")
95
+
96
+ def read_table(self, path: Union[str, Path]) -> pd.DataFrame:
97
+ path = Path(path)
98
+ ext = path.suffix.lower()
99
+
100
+ if ext == ".csv":
101
+ return pd.read_csv(path)
102
+ if ext == ".tsv":
103
+ return pd.read_csv(path, sep="\t")
104
+ if ext == ".xlsx":
105
+ return pd.read_excel(path, engine="openpyxl")
106
+ if ext == ".xls":
107
+ return pd.read_excel(path, engine="xlrd")
108
+
109
+ # Fallback: attempt CSV read if unknown
110
+ try:
111
+ return pd.read_csv(path)
112
+ except Exception as e:
113
+ raise ValueError(f"Unsupported file extension for tables: {ext}") from e
114
+
115
+ # -----------------------
116
+ # Public API (bytes)
117
+ # -----------------------
118
+ def read_text_from_bytes(self, data: bytes, ext: str) -> str:
119
+ """
120
+ Extract text from in-memory bytes.
121
+ ext: file extension (e.g., '.pdf', '.docx', '.txt', '.pptx', '.json', '.yaml', '.xml', '.csv', '.xlsx')
122
+ """
123
+ ext = self._normalize_ext(ext)
124
+
125
+ if ext in self.TEXT_EXTS:
126
+ # Robust encoding detection
127
+ res = cn_from_bytes(data).best()
128
+ return str(res) if res else data.decode(self.default_encoding, errors=self.errors)
129
+
130
+ if ext in self.PDF_EXTS:
131
+ return self._read_pdf_text_bytes(data)
132
+
133
+ if ext in self.DOCX_EXTS:
134
+ return self._read_docx_text_fp(io.BytesIO(data))
135
+
136
+ if ext in self.PPTX_EXTS:
137
+ return self._read_pptx_text_fp(io.BytesIO(data))
138
+
139
+ if ext in self.JSON_EXTS:
140
+ obj = json.loads(data.decode(self.default_encoding, errors=self.errors))
141
+ return json.dumps(obj, indent=2, ensure_ascii=False)
142
+
143
+ if ext in self.YAML_EXTS:
144
+ obj = yaml.safe_load(data.decode(self.default_encoding, errors=self.errors))
145
+ return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
146
+
147
+ if ext in self.INI_EXTS:
148
+ import configparser
149
+ parser = configparser.ConfigParser()
150
+ parser.read_string(data.decode(self.default_encoding, errors=self.errors))
151
+ output = io.StringIO()
152
+ parser.write(output)
153
+ return output.getvalue()
154
+
155
+ if ext in self.XML_EXTS:
156
+ tree = etree.parse(io.BytesIO(data))
157
+ return etree.tostring(tree, pretty_print=True, encoding="unicode")
158
+
159
+ if ext in self.TABLE_EXTS:
160
+ df = self.read_table_from_bytes(data, ext)
161
+ return df.to_csv(index=False)
162
+
163
+ raise ValueError(f"Unsupported extension for text extraction from bytes: {ext}")
164
+
165
+ def read_table_from_bytes(self, data: bytes, ext: str) -> pd.DataFrame:
166
+ """
167
+ Load tabular data from in-memory bytes into a DataFrame.
168
+ """
169
+ ext = self._normalize_ext(ext)
170
+
171
+ if ext == ".csv":
172
+ return pd.read_csv(io.BytesIO(data))
173
+ if ext == ".tsv":
174
+ return pd.read_csv(io.BytesIO(data), sep="\t")
175
+ if ext == ".xlsx":
176
+ return pd.read_excel(io.BytesIO(data), engine="openpyxl")
177
+ if ext == ".xls":
178
+ return pd.read_excel(io.BytesIO(data), engine="xlrd")
179
+
180
+ # Opportunistic fallback: try CSV
181
+ try:
182
+ return pd.read_csv(io.BytesIO(data))
183
+ except Exception as e:
184
+ raise ValueError(f"Unsupported extension for table reading from bytes: {ext}") from e
185
+
186
+ # -----------------------
187
+ # Internal helpers
188
+ # -----------------------
189
+ def _normalize_ext(self, ext: str) -> str:
190
+ ext = (ext or "").strip().lower()
191
+ if not ext.startswith("."):
192
+ ext = "." + ext
193
+ return ext
194
+
195
+ def _read_pdf_text_path(self, path: Path) -> str:
196
+ # Prefer PyMuPDF
197
+ try:
198
+ parts: List[str] = []
199
+ with fitz.open(str(path)) as doc:
200
+ if doc.is_encrypted and not doc.authenticate(""):
201
+ raise RuntimeError("Encrypted PDF requires a password.")
202
+ for page in doc:
203
+ parts.append(page.get_text("text"))
204
+ text = "\n\n".join(parts).strip()
205
+ if text:
206
+ return text
207
+ except Exception:
208
+ pass
209
+
210
+ # Fallback: pdfplumber
211
+ with pdfplumber.open(str(path)) as pdf:
212
+ return "\n\n".join([(p.extract_text() or "") for p in pdf.pages]).strip()
213
+
214
+ def _read_pdf_text_bytes(self, data: bytes) -> str:
215
+ # PyMuPDF can open from bytes
216
+ try:
217
+ doc = fitz.open(stream=data, filetype="pdf")
218
+ parts: List[str] = []
219
+ if doc.is_encrypted and not doc.authenticate(""):
220
+ raise RuntimeError("Encrypted PDF requires a password.")
221
+ for page in doc:
222
+ parts.append(page.get_text("text"))
223
+ doc.close()
224
+ text = "\n\n".join(parts).strip()
225
+ if text:
226
+ return text
227
+ except Exception:
228
+ pass
229
+
230
+ # Fallback to pdfplumber from BytesIO
231
+ with pdfplumber.open(io.BytesIO(data)) as pdf:
232
+ return "\n\n".join([(p.extract_text() or "") for p in pdf.pages]).strip()
233
+
234
+ def _read_docx_text_fp(self, fp) -> str:
235
+ doc = DocxDocument(fp)
236
+ chunks = []
237
+ for p in doc.paragraphs:
238
+ if p.text:
239
+ chunks.append(p.text)
240
+ for table in doc.tables:
241
+ for row in table.rows:
242
+ cells = [cell.text.strip() for cell in row.cells]
243
+ if any(cells):
244
+ chunks.append("\t".join(cells))
245
+ return "\n".join(chunks).strip()
246
+
247
+ def _read_pptx_text_fp(self, fp) -> str:
248
+ prs = Presentation(fp)
249
+ chunks = []
250
+ for slide in prs.slides:
251
+ for shape in slide.shapes:
252
+ if hasattr(shape, "has_text_frame") and shape.has_text_frame:
253
+ text = shape.text or ""
254
+ if text:
255
+ chunks.append(text)
256
+ return "\n".join(chunks).strip()
@@ -0,0 +1,9 @@
1
+ class DatasourceLibError(Exception):
2
+ """Base exception for datasourcelib."""
3
+
4
+ class SyncStrategyNotFound(DatasourceLibError):
5
+ """Raised when a strategy is not found."""
6
+
7
+ # Added: DataSourceNotFound to represent missing/unknown data sources
8
+ class DataSourceNotFound(DatasourceLibError):
9
+ """Raised when a data source is not found or not registered."""
@@ -0,0 +1,217 @@
1
+ from pathlib import Path
2
+ from typing import Optional, Union, List
3
+ import io
4
+ import pandas as pd
5
+
6
+ # --- Optional helpers ---
7
+ from charset_normalizer import from_path as cn_from_path
8
+
9
+ # DOCX
10
+ from docx import Document as DocxDocument
11
+
12
+ # PDF
13
+ import fitz # pymupdf
14
+ import pdfplumber
15
+
16
+ # PPTX
17
+ from pptx import Presentation
18
+
19
+ # YAML / XML
20
+ import yaml
21
+ from lxml import etree
22
+
23
+
24
+ class FileReader:
25
+ """
26
+ A unified reader for common file types.
27
+ - read_text(path): extract text from txt, pdf, docx, pptx, json, yaml, ini, xml
28
+ - read_table(path): load tabular data from csv, tsv, xlsx/xls
29
+ """
30
+
31
+ TEXT_EXTS = {".txt", ".log", ".md"}
32
+ TABLE_EXTS = {".csv", ".tsv", ".xlsx", ".xls"}
33
+ DOCX_EXTS = {".docx"}
34
+ PDF_EXTS = {".pdf"}
35
+ PPTX_EXTS = {".pptx"}
36
+ JSON_EXTS = {".json"}
37
+ YAML_EXTS = {".yaml", ".yml"}
38
+ INI_EXTS = {".ini", ".cfg"}
39
+ XML_EXTS = {".xml"}
40
+
41
+ def __init__(self, default_encoding: str = "utf-8", errors: str = "replace"):
42
+ self.default_encoding = default_encoding
43
+ self.errors = errors
44
+
45
+ # -----------------------
46
+ # Public API
47
+ # -----------------------
48
+ def read_text(self, path: Union[str, Path]) -> str:
49
+ """
50
+ Extract best-effort text from a given file based on extension.
51
+ """
52
+ path = Path(path)
53
+ ext = path.suffix.lower()
54
+
55
+ if ext in self.TEXT_EXTS:
56
+ return self._read_plain_text(path)
57
+
58
+ if ext in self.PDF_EXTS:
59
+ return self._read_pdf_text(path)
60
+
61
+ if ext in self.DOCX_EXTS:
62
+ return self._read_docx_text(path)
63
+
64
+ if ext in self.PPTX_EXTS:
65
+ return self._read_pptx_text(path)
66
+
67
+ if ext in self.JSON_EXTS:
68
+ return self._read_json_text(path)
69
+
70
+ if ext in self.YAML_EXTS:
71
+ return self._read_yaml_text(path)
72
+
73
+ if ext in self.INI_EXTS:
74
+ return self._read_ini_text(path)
75
+
76
+ if ext in self.XML_EXTS:
77
+ return self._read_xml_text(path)
78
+
79
+ if ext in self.TABLE_EXTS:
80
+ # For tabular files, provide a quick text representation
81
+ df = self.read_table(path)
82
+ return df.to_csv(index=False)
83
+
84
+ raise ValueError(f"Unsupported file extension for text extraction: {ext}")
85
+
86
+ def read_table(self, path: Union[str, Path]) -> pd.DataFrame:
87
+ """
88
+ Load tabular data from CSV/TSV/Excel, returning a DataFrame.
89
+ """
90
+ path = Path(path)
91
+ ext = path.suffix.lower()
92
+
93
+ if ext == ".csv":
94
+ return pd.read_csv(path)
95
+ if ext == ".tsv":
96
+ return pd.read_csv(path, sep="\t")
97
+ if ext == ".xlsx":
98
+ return pd.read_excel(path, engine="openpyxl")
99
+ if ext == ".xls":
100
+ return pd.read_excel(path, engine="xlrd")
101
+
102
+ # Fallback: attempt CSV read if unknown
103
+ try:
104
+ return pd.read_csv(path)
105
+ except Exception as e:
106
+ raise ValueError(f"Unsupported file extension for tables: {ext}") from e
107
+
108
+ # -----------------------
109
+ # Text readers
110
+ # -----------------------
111
+ def _read_plain_text(self, path: Path) -> str:
112
+ # Detect encoding for robustness
113
+ res = cn_from_path(str(path)).best()
114
+ if res:
115
+ return str(res)
116
+ # Fallback to configured defaults
117
+ return path.read_text(encoding=self.default_encoding, errors=self.errors)
118
+
119
+ def _read_pdf_text(self, path: Path) -> str:
120
+ # Try PyMuPDF (fast, layout-aware)
121
+ try:
122
+ text_parts: List[str] = []
123
+ with fitz.open(str(path)) as doc:
124
+ if doc.is_encrypted:
125
+ # If encrypted and requires a password, this will fail to extract text.
126
+ if not doc.authenticate(""):
127
+ raise RuntimeError("Encrypted PDF requires a password.")
128
+ for page in doc:
129
+ text_parts.append(page.get_text("text"))
130
+ text = "\n".join(text_parts).strip()
131
+ if text:
132
+ return text
133
+ except Exception:
134
+ pass
135
+
136
+ # Fallback to pdfplumber (good for tables/structured text)
137
+ try:
138
+ text_parts = []
139
+ with pdfplumber.open(str(path)) as pdf:
140
+ for page in pdf.pages:
141
+ t = page.extract_text() or ""
142
+ text_parts.append(t)
143
+ return "\n".join(text_parts).strip()
144
+ except Exception as e:
145
+ raise RuntimeError(f"Failed to read PDF: {e}") from e
146
+
147
+ def _read_docx_text(self, path: Path) -> str:
148
+ doc = DocxDocument(str(path))
149
+ chunks = []
150
+ # Paragraphs
151
+ for p in doc.paragraphs:
152
+ if p.text:
153
+ chunks.append(p.text)
154
+ # Tables (optional: include)
155
+ for table in doc.tables:
156
+ for row in table.rows:
157
+ cells = [cell.text.strip() for cell in row.cells]
158
+ if any(cells):
159
+ chunks.append("\t".join(cells))
160
+ return "\n".join(chunks).strip()
161
+
162
+ def _read_pptx_text(self, path: Path) -> str:
163
+ prs = Presentation(str(path))
164
+ chunks = []
165
+ for slide in prs.slides:
166
+ for shape in slide.shapes:
167
+ if hasattr(shape, "text") and shape.has_text_frame:
168
+ text = shape.text or ""
169
+ if text:
170
+ chunks.append(text)
171
+ return "\n".join(chunks).strip()
172
+
173
+ def _read_json_text(self, path: Path) -> str:
174
+ import json
175
+ with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
176
+ obj = json.load(f)
177
+ # Pretty-print
178
+ return json.dumps(obj, indent=2, ensure_ascii=False)
179
+
180
+ def _read_yaml_text(self, path: Path) -> str:
181
+ with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
182
+ obj = yaml.safe_load(f)
183
+ return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
184
+
185
+ def _read_ini_text(self, path: Path) -> str:
186
+ import configparser
187
+ parser = configparser.ConfigParser()
188
+ with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
189
+ # INI files might have duplicate keys; defaults handle many cases
190
+ parser.read_file(f)
191
+ output = io.StringIO()
192
+ parser.write(output)
193
+ return output.getvalue()
194
+
195
+ def _read_xml_text(self, path: Path) -> str:
196
+ # Pretty-print XML
197
+ tree = etree.parse(str(path))
198
+ return etree.tostring(tree, pretty_print=True, encoding="unicode")
199
+
200
+
201
+ # -----------------------
202
+ # Example usage
203
+ # -----------------------
204
+ #if __name__ == "__main__":
205
+ # reader = FileReader()
206
+
207
+ # 1) Extract text
208
+ # print(reader.read_text("document.pdf"))
209
+ # print(reader.read_text("report.docx"))
210
+ # print(reader.read_text("slides.pptx"))
211
+ # print(reader.read_text("notes.txt"))
212
+ # print(reader.read_text("config.yaml"))
213
+ # print(reader.read_text("data.xml"))
214
+
215
+ # 2) Load tabular data
216
+ # df = reader.read_table("data.xlsx")
217
+ # print(df.head())
@@ -0,0 +1,12 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ def get_logger(name: Optional[str] = None) -> logging.Logger:
5
+ logger = logging.getLogger(name or __name__)
6
+ if not logger.handlers:
7
+ h = logging.StreamHandler()
8
+ fmt = "%(asctime)s %(levelname)s %(name)s %(message)s"
9
+ h.setFormatter(logging.Formatter(fmt))
10
+ logger.addHandler(h)
11
+ logger.setLevel(logging.INFO)
12
+ return logger
@@ -0,0 +1,7 @@
1
+ from typing import Dict
2
+
3
+ def require_keys(cfg: Dict, keys):
4
+ missing = [k for k in keys if k not in cfg]
5
+ if missing:
6
+ raise KeyError(f"Missing required keys: {missing}")
7
+ return True
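Usage sketch for require_keys; the config dict and its values are placeholders.

cfg = {"aisearch_endpoint": "https://example.search.windows.net",
       "aisearch_api_key": "..."}
require_keys(cfg, ["aisearch_endpoint", "aisearch_api_key"])   # returns True
require_keys(cfg, ["aisearch_index_name"])                     # raises KeyError listing the missing key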
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasourcelib
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Data source sync strategies for vector DBs
5
5
  Home-page: https://github.com/jaiprakash0217/datasourcelib
6
6
  Author: Jai Prakash
@@ -0,0 +1,32 @@
1
+ datasourcelib/__init__.py,sha256=I7JTSZ1J6ULg_TfdMEgFcd1regkCHuyKdZT4DcPtoyQ,78
2
+ datasourcelib/core/__init__.py,sha256=nsXojDd97T7eMqqtCsZr1qSYLBitvKydSZRb9Dg7hqU,462
3
+ datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3NAY,716
4
+ datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
5
+ datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
6
+ datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
7
+ datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
8
+ datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
9
+ datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
10
+ datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
11
+ datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
12
+ datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
13
+ datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
14
+ datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
15
+ datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
16
+ datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
17
+ datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
18
+ datasourcelib/strategies/full_load.py,sha256=U1a9wO_ZLRnMInvU0IRW-ZKnhu0Cv437VcNMKIYuzMA,1691
19
+ datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
20
+ datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
21
+ datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900
22
+ datasourcelib/utils/__init__.py,sha256=9pSIpaK-kdmNuDzwl0Z7QU-_lV3cZE-iwOEPh3RBBTs,298
23
+ datasourcelib/utils/byte_reader.py,sha256=GaoPXwJa2YTWG1Kim0K6JG20eVSaWkZJd1o9bswxHmc,9082
24
+ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4fYQww,369
25
+ datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
26
+ datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
27
+ datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
28
+ datasourcelib-0.1.4.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
29
+ datasourcelib-0.1.4.dist-info/METADATA,sha256=LR3db7O_rnbTmF_owLl-lH06xAfP-iZu4aXPtmjVtRo,1185
30
+ datasourcelib-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
+ datasourcelib-0.1.4.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
32
+ datasourcelib-0.1.4.dist-info/RECORD,,
@@ -1,162 +0,0 @@
1
- from typing import List, Dict, Any, Optional
2
- from datasourcelib.utils.logger import get_logger
3
-
4
- logger = get_logger(__name__)
5
-
6
- class AzureSearchIndexer:
7
- """
8
- Minimal Azure Cognitive Search indexer wrapper.
9
- Expects vector_db_config with:
10
- - service_endpoint: str
11
- - index_name: str
12
- - api_key: str
13
- Optional:
14
- - key_field: name of unique key in documents (default 'id')
15
- """
16
-
17
- def __init__(self, vector_db_config: Dict[str, Any]):
18
- self.config = vector_db_config or {}
19
- self._client = None
20
- self._index_client = None
21
-
22
- def validate_config(self) -> bool:
23
- required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
24
- missing = [k for k in required if k not in self.config]
25
- if missing:
26
- logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
27
- return False
28
- return True
29
-
30
- def _ensure_sdk(self):
31
- try:
32
- from azure.core.credentials import AzureKeyCredential # type: ignore
33
- from azure.search.documents import SearchClient # type: ignore
34
- from azure.search.documents.indexes import SearchIndexClient # type: ignore
35
- from azure.search.documents.indexes.models import (
36
- SearchIndex,
37
- SimpleField,
38
- SearchableField,
39
- SearchFieldDataType,
40
- ) # type: ignore
41
- except Exception as e:
42
- raise RuntimeError("azure-search-documents package is required: install azure-search-documents") from e
43
-
44
- return AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType
45
-
46
- def _infer_field_type(self, value) -> Any:
47
- """
48
- Map Python types to SearchFieldDataType
49
- """
50
- *_, SearchFieldDataType = self._ensure_sdk()
51
- if value is None:
52
- return SearchFieldDataType.String
53
- t = type(value)
54
- if t is str:
55
- return SearchFieldDataType.String
56
- if t is bool:
57
- return SearchFieldDataType.Boolean
58
- if t is int:
59
- return SearchFieldDataType.Int32
60
- if t is float:
61
- return SearchFieldDataType.Double
62
- # fallback to string
63
- return SearchFieldDataType.String
64
-
65
- def _build_fields(self, sample: Dict[str, Any], key_field: str):
66
- AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
67
-
68
- fields = []
69
- # ensure key field present
70
- if key_field not in sample:
71
- # we'll create a string key, uploader will populate unique ids
72
- fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
73
- else:
74
- typ = self._infer_field_type(sample[key_field])
75
- fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
76
-
77
- for k, v in sample.items():
78
- logger.info(f"================={k}============")
79
- if k == key_field:
80
- continue
81
- typ = self._infer_field_type(v)
82
- # for strings use SearchableField so full text queries work
83
- if typ == SearchFieldDataType.String:
84
- fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
85
- else:
86
- fields.append(SimpleField(name=k, type=typ))
87
- return fields
88
-
89
- def create_index(self, sample: Dict[str, Any]) -> bool:
90
- try:
91
- AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
92
- endpoint = self.config["aisearch_endpoint"]
93
- api_key = self.config["aisearch_api_key"]
94
- index_name = self.config["aisearch_index_name"]
95
- key_field = self.config.get("key_field", "id")
96
-
97
- index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
98
- fields = self._build_fields(sample, key_field)
99
- logger.info("=================Creating Index============")
100
- index = SearchIndex(name=index_name, fields=fields)
101
- # create or update index
102
- index_client.create_or_update_index(index)
103
- logger.info("Azure Search index '%s' created/updated", index_name)
104
- return True
105
- except Exception as ex:
106
- logger.exception("AzureSearchIndexer.create_index failed")
107
- return False
108
-
109
- def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
110
- try:
111
- AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
112
- endpoint = self.config["aisearch_endpoint"]
113
- api_key = self.config["aisearch_api_key"]
114
- index_name = self.config["aisearch_index_name"]
115
- key_field = self.config.get("key_field", "id")
116
-
117
- # ensure each doc has key_field
118
- from uuid import uuid4
119
- for d in docs:
120
- if key_field not in d:
121
- d[key_field] = str(uuid4())
122
- # ensure each doc has key_field is of string type
123
- for d in docs:
124
- if key_field in d:
125
- typ = self._infer_field_type(d[key_field])
126
- if typ != SearchFieldDataType.String:
127
- d[key_field] = str(d[key_field])
128
-
129
- client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
130
- logger.info("Uploading %d documents to index %s", len(docs), index_name)
131
- result = client.upload_documents(documents=docs)
132
- # Check results for failures
133
- failed = [r for r in result if not r.succeeded]
134
- if failed:
135
- logger.error("Some documents failed to upload: %s", failed)
136
- return False
137
- logger.info("Uploaded documents successfully")
138
- return True
139
- except Exception:
140
- logger.exception("AzureSearchIndexer.upload_documents failed")
141
- return False
142
-
143
- def index(self, rows: List[Dict[str, Any]]) -> bool:
144
- """
145
- High level: create index (based on first row) and upload all rows.
146
- """
147
- if not rows:
148
- logger.error("AzureSearchIndexer.index called with empty rows")
149
- return False
150
- try:
151
- if not self.validate_config():
152
- return False
153
- sample = rows[0]
154
- logger.info(f"================={sample}============")
155
- ok = self.create_index(sample)
156
- if not ok:
157
- return False
158
- ok2 = self.upload_documents(rows)
159
- return ok2
160
- except Exception:
161
- logger.exception("AzureSearchIndexer.index failed")
162
- return False
@@ -1,286 +0,0 @@
1
- from typing import List, Dict, Any, Optional
2
- from datasourcelib.utils.logger import get_logger
3
-
4
- logger = get_logger(__name__)
5
-
6
- class AzureSearchIndexer:
7
- """
8
- Azure Cognitive Search indexer with vector search support.
9
- Required vector_db_config:
10
- - aisearch_endpoint: str
11
- - aisearch_index_name: str
12
- - aisearch_api_key
13
-
14
- Optional vector search config:
15
- - vectorization: bool (enable vector search)
16
- - vector_config: dict
17
- - dimensions: int (default 1024)
18
- - algorithm: str ('hnsw' or 'flat', default 'hnsw')
19
- - metric: str ('cosine', 'euclidean', 'dotProduct', default 'cosine')
20
- - key_field: str (default 'id')
21
- - vector_field: str (default 'contentVector')
22
- - embedding_endpoint: str (Azure OpenAI endpoint for embeddings)
23
- - embedding_key: str (Azure OpenAI API key)
24
- - embedding_deployment: str (Azure OpenAI model deployment name)
25
- """
26
-
27
- def __init__(self, vector_db_config: Dict[str, Any]):
28
- self.config = vector_db_config or {}
29
- self._client = None
30
- self._index_client = None
31
- self._embedding_client = None
32
-
33
- def validate_config(self) -> bool:
34
- required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
35
- missing = [k for k in required if k not in self.config]
36
-
37
- # Check vector search requirements if enabled
38
- if self.config.get("vectorization", False):
39
- vector_required = ("embedding_endpoint", "embedding_key", "embedding_deployment")
40
- missing.extend([k for k in vector_required if k not in self.config])
41
-
42
- if missing:
43
- logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
44
- return False
45
- return True
46
-
47
- def _ensure_sdk(self):
48
- try:
49
- from azure.core.credentials import AzureKeyCredential # type: ignore
50
- from azure.search.documents import SearchClient # type: ignore
51
- from azure.search.documents.indexes import SearchIndexClient # type: ignore
52
- from openai import AzureOpenAI # type: ignore
53
- from azure.search.documents.indexes.models import (
54
- SearchIndex,
55
- SearchField,
56
- SearchFieldDataType,
57
- SimpleField,
58
- SearchableField,
59
- VectorSearch,
60
- VectorSearchProfile,
61
- HnswAlgorithmConfiguration
62
- ) # type: ignore
63
-
64
- except Exception as e:
65
- raise RuntimeError("Required packages missing. Install: azure-search-documents openai") from e
66
-
67
- return (
68
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration
69
- )
70
-
71
- def _setup_embedding_client(self):
72
- if not self._embedding_client and self.config.get("vectorization"):
73
- try:
74
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
75
- self._embedding_client = AzureOpenAI(
76
- api_version=self.config["embedding_api_version"],
77
- azure_endpoint=self.config["embedding_endpoint"],
78
- api_key=self.config["embedding_key"],
79
- )
80
- logger.info("Azure OpenAI embedding client initialized")
81
- except Exception as ex:
82
- logger.exception("Failed to initialize embedding client")
83
- raise
84
-
85
- def _get_embeddings(self, text: str) -> List[float]:
86
- try:
87
- self._setup_embedding_client()
88
- response = self._embedding_client.embeddings.create(
89
- model=self.config["embedding_deployment"],
90
- input=text
91
- )
92
- return response.data[0].embedding
93
- except Exception as ex:
94
- logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
95
- raise
96
-
97
- def _build_vector_search_config(self):
98
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
99
- vector_config = self.config.get("vector_config", {})
100
- dimensions = vector_config.get("dimensions", 1536)
101
-
102
- vector_search = VectorSearch(
103
- profiles=[VectorSearchProfile(name="vector-profile-1", algorithm_configuration_name="algorithms-config-1")],
104
- algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")]
105
- )
106
-
107
- return vector_search, dimensions
108
-
109
- def _infer_field_type(self, value) -> Any:
110
- #Map Python types to SearchFieldDataType, including collections
111
-
112
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
113
-
114
- if value is None:
115
- return SearchFieldDataType.String
116
-
117
- t = type(value)
118
-
119
- # Handle list/array types as Collections
120
- if t in (list, tuple):
121
- # If empty list, default to Collection of Double
122
- if not value:
123
- return SearchFieldDataType.Collection(SearchFieldDataType.Double)
124
- # Get type of first element for non-empty lists
125
- element_type = self._infer_field_type(value[0])
126
- return SearchFieldDataType.Collection(element_type)
127
- # Handle vector embeddings (list or tuple of floats)
128
- if type(value) in (list, tuple) and all(isinstance(x, (int, float)) for x in value):
129
- return SearchFieldDataType.Collection(SearchFieldDataType.Single)
130
-
131
- # Handle basic types
132
- logger.info(f"######## Infer field type for value:[ {value} ] of type [ {t} ]")
133
- if t is bool:
134
- return SearchFieldDataType.Boolean
135
- if t is int:
136
- return SearchFieldDataType.Int32
137
- if t is float:
138
- return SearchFieldDataType.Double
139
- print(f"############## Infer field type for value: {value} of type {t}")
140
- print(t is str)
141
- if t is str:
142
- return SearchFieldDataType.String
143
- # fallback to string
144
- logger.warning(f"Falling back to string type for value: {value} of type {t}")
145
- return SearchFieldDataType.String
146
-
147
- def _build_fields(self, sample: Dict[str, Any], key_field: str):
148
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
149
-
150
- fields = []
151
- # Add key field
152
- if key_field not in sample:
153
- fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
154
- else:
155
- fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
156
-
157
- # Add regular fields
158
- for k, v in sample.items():
159
- logger.info(f"================={k}============")
160
- if k == key_field:
161
- continue
162
- logger.info(f"#### Infer field type for field: {k}")
163
- typ = self._infer_field_type(v)
164
- logger.info(f"#### Inferred type for field {k}: {typ}")
165
- if typ == SearchFieldDataType.String:
166
- fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
167
- else:
168
- fields.append(SimpleField(name=k, type=typ))
169
-
170
- # Add vector field if vectorization is enabled
171
- if self.config.get("vectorization"):
172
- vector_field = self.config.get("vector_field", "contentVector")
173
- _, dimensions = self._build_vector_search_config()
174
- fields.append(
175
- SearchField(
176
- name=vector_field,
177
- type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
178
- searchable=True,
179
- vector_search_dimensions=dimensions,
180
- vector_search_profile_name="vector-profile-1"
181
- )
182
- )
183
-
184
- return fields
185
-
186
- def create_index(self, sample: Dict[str, Any]) -> bool:
187
- try:
188
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
189
-
190
- endpoint = self.config["aisearch_endpoint"]
191
- api_key = self.config["aisearch_api_key"]
192
- index_name = self.config["aisearch_index_name"]
193
- key_field = self.config.get("key_field", "id")
194
-
195
- index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
196
- fields = self._build_fields(sample, key_field)
197
-
198
- # Create index with vector search if enabled
199
- if self.config.get("vectorization"):
200
- vector_search, _ = self._build_vector_search_config()
201
- index = SearchIndex(
202
- name=index_name,
203
- fields=fields,
204
- vector_search=vector_search
205
- )
206
- else:
207
- index = SearchIndex(name=index_name, fields=fields)
208
-
209
- index_client.create_or_update_index(index)
210
- logger.info(f"Azure Search index '{index_name}' created/updated with vectorization={self.config.get('vectorization', False)}")
211
- return True
212
- except Exception as ex:
213
- logger.exception("AzureSearchIndexer.create_index failed")
214
- return False
215
-
216
- def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
217
- try:
218
- AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
219
- endpoint = self.config["aisearch_endpoint"]
220
- api_key = self.config["aisearch_api_key"]
221
- index_name = self.config["aisearch_index_name"]
222
- key_field = self.config.get("key_field", "id")
223
-
224
- # Add IDs if missing
225
- from uuid import uuid4
226
- for d in docs:
227
- if key_field not in d:
228
- d[key_field] = str(uuid4())
229
- elif not isinstance(d[key_field], str):
230
- d[key_field] = str(d[key_field])
231
-
232
- # Add vector embeddings if enabled
233
- if self.config.get("vectorization"):
234
- vector_field = self.config.get("vector_field", "contentVector")
235
- content_field = self.config.get("content_field", "content")
236
-
237
- for doc in docs:
238
- if content_field in doc:
239
- try:
240
- embedding = self._get_embeddings(str(doc[content_field]))
241
- doc[vector_field] = embedding
242
- except Exception as e:
243
- logger.error(f"Failed to get embedding for document {doc.get(key_field)}: {str(e)}")
244
- continue
245
-
246
- client = SearchClient(endpoint=endpoint, index_name=index_name,
247
- credential=AzureKeyCredential(api_key))
248
-
249
- logger.info(f"Uploading {len(docs)} documents to index {index_name}")
250
- result = client.upload_documents(documents=docs)
251
-
252
- failed = [r for r in result if not r.succeeded]
253
- if failed:
254
- logger.error(f"Some documents failed to upload: {failed}")
255
- return False
256
-
257
- logger.info("Documents uploaded successfully")
258
- return True
259
-
260
- except Exception:
261
- logger.exception("AzureSearchIndexer.upload_documents failed")
262
- return False
263
-
264
- def index(self, rows: List[Dict[str, Any]]) -> bool:
265
- """High level: create index (based on first row) and upload all rows."""
266
- if not rows:
267
- logger.error("AzureSearchIndexer.index called with empty rows")
268
- return False
269
-
270
- try:
271
- if not self.validate_config():
272
- return False
273
-
274
- sample = rows[0]
275
- logger.info(f"Creating/updating index with sample: {sample}")
276
-
277
- ok = self.create_index(sample)
278
- if not ok:
279
- return False
280
-
281
- ok2 = self.upload_documents(rows)
282
- return ok2
283
-
284
- except Exception:
285
- logger.exception("AzureSearchIndexer.index failed")
286
- return False
@@ -1,22 +0,0 @@
1
- datasourcelib/__init__.py,sha256=I7JTSZ1J6ULg_TfdMEgFcd1regkCHuyKdZT4DcPtoyQ,78
2
- datasourcelib/core/__init__.py,sha256=nsXojDd97T7eMqqtCsZr1qSYLBitvKydSZRb9Dg7hqU,462
3
- datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3NAY,716
4
- datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
5
- datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
6
- datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
7
- datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
8
- datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
9
- datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
10
- datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
11
- datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
12
- datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
13
- datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
14
- datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
15
- datasourcelib/indexes/azure_search_index.py,sha256=o3BoSxURBk5jCC3AlNz-v9_igg-dXYS4yUxXZwSfqFg,17265
16
- datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
17
- datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
18
- datasourcelib-0.1.2.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
19
- datasourcelib-0.1.2.dist-info/METADATA,sha256=DOKGwf3XspFhCQRLYLod8Oqc2sUjhDaFFe15xiKqQhQ,1185
20
- datasourcelib-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
21
- datasourcelib-0.1.2.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
22
- datasourcelib-0.1.2.dist-info/RECORD,,