datasourcelib 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasourcelib/indexes/azure_search_index.py +102 -1
- datasourcelib/strategies/__init__.py +14 -0
- datasourcelib/strategies/daily_load.py +22 -0
- datasourcelib/strategies/full_load.py +38 -0
- datasourcelib/strategies/incremental_load.py +27 -0
- datasourcelib/strategies/ondemand_load.py +19 -0
- datasourcelib/strategies/timerange_load.py +24 -0
- datasourcelib/utils/__init__.py +12 -0
- datasourcelib/utils/byte_reader.py +256 -0
- datasourcelib/utils/exceptions.py +9 -0
- datasourcelib/utils/file_reader.py +217 -0
- datasourcelib/utils/logger.py +12 -0
- datasourcelib/utils/validators.py +7 -0
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/METADATA +1 -1
- datasourcelib-0.1.4.dist-info/RECORD +32 -0
- datasourcelib/indexes/azure_search_index_only.py +0 -162
- datasourcelib/indexes/azure_search_index_vector.py +0 -286
- datasourcelib-0.1.2.dist-info/RECORD +0 -22
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/WHEEL +0 -0
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/top_level.txt +0 -0
datasourcelib/indexes/azure_search_index.py
@@ -110,7 +110,7 @@ class AzureSearchIndexer:
             logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
             raise
 
-    def _build_vector_search_config(self):
+    def _build_vector_search_config_old(self):
         AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
         vector_config = self.config.get("vector_config", {})
         dimensions = vector_config.get("dimensions", 1536)
@@ -121,6 +121,107 @@ class AzureSearchIndexer:
         )
 
         return vector_search, dimensions
+
+    def _build_vector_search_config(self):
+        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, SemanticSearch, SemanticField, SemanticConfiguration, SemanticPrioritizedFields = self._ensure_sdk()
+
+        vector_config = self.config.get("vector_config", {})
+        dimensions = vector_config.get("dimensions", 1536)
+        algorithm = vector_config.get("algorithm", "hnsw").lower()
+
+        # Build algorithm configuration (SDK model if available)
+        alg_cfg = HnswAlgorithmConfiguration(name="algorithms-config-1")
+
+        # Build vectorizer settings using Azure OpenAI config from vector_db_config
+        deployment = self.config.get("embedding_deployment")
+        endpoint = self.config.get("embedding_endpoint")
+        api_key = self.config.get("embedding_key")
+        # modelName required for API version 2025-09-01 — prefer explicit embedding_model, fall back to deployment
+        model_name = self.config.get("embedding_model") or deployment
+        content_field = self.config.get("content_field", "content")
+        vector_field = self.config.get("vector_field", "contentVector")
+
+        if not model_name:
+            raise RuntimeError("Vectorizer configuration requires 'embedding_model' or 'embedding_deployment' in vector_db_config")
+
+        # Define vectorizer with explicit name and required azureOpenAIParameters including modelName
+        vectorizer_name = "azure-openai-vectorizer"
+        vectorizer = {
+            "name": vectorizer_name,
+            "kind": "azureOpenAI",
+            "azureOpenAIParameters": {
+                "resourceUri": endpoint.rstrip('/') if endpoint else None,
+                # include both modelName (required) and deploymentId (if provided)
+                "modelName": model_name,
+                **({"deploymentId": deployment} if deployment else {}),
+                "apiKey": api_key
+            },
+            "options": {
+                "fieldMapping": [
+                    {
+                        "sourceContext": f"/document/{content_field}",
+                        "outputs": [
+                            {
+                                "targetContext": f"/document/{vector_field}",
+                                "targetDimensions": dimensions
+                            }
+                        ]
+                    }
+                ]
+            }
+        }
+
+        profile_name = "vector-profile-1"
+        try:
+            # Create profile with vectorizer reference (SDK may expect vectorizer_name or vectorizer depending on version)
+            try:
+                profile = VectorSearchProfile(
+                    name=profile_name,
+                    algorithm_configuration_name="algorithms-config-1",
+                    vectorizer_name=vectorizer_name
+                )
+            except TypeError:
+                # fallback if SDK constructor uses different parameter names
+                profile = VectorSearchProfile(name=profile_name, algorithm_configuration_name="algorithms-config-1")
+                try:
+                    setattr(profile, "vectorizer_name", vectorizer_name)
+                except Exception:
+                    pass
+
+            try:
+                # Construct full vector search config with both profile and vectorizer
+                vector_search = VectorSearch(
+                    profiles=[profile],
+                    algorithms=[alg_cfg],
+                    vectorizers=[vectorizer]
+                )
+            except Exception:
+                # Fallback to dict if SDK constructor differs
+                vector_search = {
+                    "profiles": [{
+                        "name": profile_name,
+                        "algorithmConfigurationName": "algorithms-config-1",
+                        "vectorizerName": vectorizer_name
+                    }],
+                    "algorithms": [{"name": "algorithms-config-1"}],
+                    "vectorizers": [vectorizer]
+                }
+        except Exception:
+            # Full dict fallback
+            vector_search = {
+                "profiles": [{
+                    "name": profile_name,
+                    "algorithmConfigurationName": "algorithms-config-1",
+                    "vectorizerName": vectorizer_name
+                }],
+                "algorithms": [{"name": "algorithms-config-1"}],
+                "vectorizers": [vectorizer]
+            }
+
+        logger.info("Built vector_search config (dimensions=%s, model=%s, vectorizer=%s)",
+                    dimensions, model_name, vectorizer_name)
+        return vector_search, dimensions
+
 
     def _build_semantic_settings(self):
         """
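For orientation, the (vector_search, dimensions) pair returned above is consumed when the index is created, in the same way the removed vector module did it. A minimal sketch, assuming the azure-search-documents models that _ensure_sdk() imports, an illustrative two-field schema, and placeholder endpoint/key/index names (not part of this diff):

# Sketch only: consuming (vector_search, dimensions) when building the index.
# "id", "contentVector" and "vector-profile-1" mirror the defaults used in this diff;
# endpoint, api_key and index_name are placeholders.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SearchField, SearchFieldDataType, SimpleField
)

def create_vector_index(indexer, endpoint: str, api_key: str, index_name: str) -> None:
    vector_search, dimensions = indexer._build_vector_search_config()
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="contentVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=dimensions,
            vector_search_profile_name="vector-profile-1",
        ),
    ]
    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
    SearchIndexClient(endpoint, AzureKeyCredential(api_key)).create_or_update_index(index)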
datasourcelib/strategies/__init__.py (new file)
@@ -0,0 +1,14 @@
+from .daily_load import DailyLoadStrategy
+from .full_load import FullLoadStrategy
+from .incremental_load import IncrementalLoadStrategy
+from .ondemand_load import OnDemandLoadStrategy
+from .timerange_load import TimeRangeLoadStrategy
+
+
+__all__ = [
+    "DailyLoadStrategy",
+    "FullLoadStrategy",
+    "IncrementalLoadStrategy",
+    "OnDemandLoadStrategy",
+    "TimeRangeLoadStrategy"
+]
datasourcelib/strategies/daily_load.py (new file)
@@ -0,0 +1,22 @@
+from datasourcelib.core.sync_base import SyncBase
+from datasourcelib.utils.logger import get_logger
+from datetime import datetime, timedelta
+
+logger = get_logger(__name__)
+
+class DailyLoadStrategy(SyncBase):
+    """Daily scheduled load (wraps incremental)."""
+
+    def validate(self) -> bool:
+        return True
+
+    def sync(self, run_date: str = None, **kwargs) -> bool:
+        try:
+            run_date = run_date or datetime.utcnow().date().isoformat()
+            logger.info("Starting daily load for %s", run_date)
+            # Typically call incremental with last_sync = previous day midnight
+            # TODO implement scheduling integration externally; the strategy here is idempotent
+            return True
+        except Exception:
+            logger.exception("DailyLoadStrategy.sync failed")
+            return False
datasourcelib/strategies/full_load.py (new file)
@@ -0,0 +1,38 @@
+from datasourcelib.core.sync_base import SyncBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.indexes.azure_search_index import AzureSearchIndexer
+logger = get_logger(__name__)
+
+class FullLoadStrategy(SyncBase):
+    """Full load: replace or reload entire source into vector DB."""
+
+    def validate(self) -> bool:
+        # Minimal validation: required keys exist
+        dsok = self.data_source.validate_config()
+        return dsok
+
+    def sync(self, **kwargs) -> bool:
+        try:
+            logger.info("Running full data load")
+            data = self.data_source.fetch_data(**kwargs)
+            for key, value in kwargs.items():
+                print(f"{key} = {value}")
+            # Implement real extract -> transform -> load to vector DB
+            # Example pseudocode:
+            # vector_client.upsert_batch(self.vector_db_config, rows)
+            # New: use AzureSearchIndexer to create index and upload documents if requested
+            if isinstance(data, list) and data:
+                indexer = AzureSearchIndexer(self.vector_db_config or {})
+                if not indexer.validate_config():
+                    logger.error("Vector DB config invalid for Azure Search indexer")
+                    return False
+                ok = indexer.index(data)
+                if not ok:
+                    logger.error("Indexing data to Azure Search failed")
+                    return False
+
+            logger.info("Full data load finished successfully")
+            return True
+        except Exception:
+            logger.exception("FullLoadStrategy.sync failed")
+            return False
datasourcelib/strategies/incremental_load.py (new file)
@@ -0,0 +1,27 @@
+from datetime import datetime
+from datasourcelib.core.sync_base import SyncBase
+from datasourcelib.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+class IncrementalLoadStrategy(SyncBase):
+    """Incremental load using last_sync timestamp or cursor."""
+
+    def validate(self) -> bool:
+        # require source to support incremental field or cursor
+        if "cursor_field" not in self.source_config and "last_sync" not in self.source_config:
+            logger.error("IncrementalLoadStrategy missing cursor_field or last_sync in source_config")
+            return False
+        return True
+
+    def sync(self, last_sync: str = None, **kwargs) -> bool:
+        try:
+            last = last_sync or self.source_config.get("last_sync")
+            logger.info("Running incremental load since %s", last)
+            # TODO: fetch delta rows since 'last' and upsert to vector DB
+            # After successful run store new last_sync timestamp
+            logger.info("Incremental load completed")
+            return True
+        except Exception:
+            logger.exception("IncrementalLoadStrategy.sync failed")
+            return False
datasourcelib/strategies/ondemand_load.py (new file)
@@ -0,0 +1,19 @@
+from datasourcelib.core.sync_base import SyncBase
+from datasourcelib.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+class OnDemandLoadStrategy(SyncBase):
+    """On demand load triggered by user request (arbitrary params)."""
+
+    def validate(self) -> bool:
+        return True
+
+    def sync(self, **kwargs) -> bool:
+        try:
+            logger.info("On-demand sync invoked with params: %s", kwargs)
+            # Use kwargs to drive partial loads, filters, ids etc.
+            return True
+        except Exception:
+            logger.exception("OnDemandLoadStrategy.sync failed")
+            return False
datasourcelib/strategies/timerange_load.py (new file)
@@ -0,0 +1,24 @@
+from datetime import datetime
+from datasourcelib.core.sync_base import SyncBase
+from datasourcelib.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+class TimeRangeLoadStrategy(SyncBase):
+    """Load records between a start and end timestamp."""
+
+    def validate(self) -> bool:
+        # rely on params at runtime; minimal validation OK
+        return True
+
+    def sync(self, start: str = None, end: str = None, **kwargs) -> bool:
+        try:
+            if not start or not end:
+                logger.error("TimeRangeLoadStrategy requires 'start' and 'end'")
+                return False
+            logger.info("Time range load between %s and %s", start, end)
+            # TODO: query source for timeframe and upsert
+            return True
+        except Exception:
+            logger.exception("TimeRangeLoadStrategy.sync failed")
+            return False
datasourcelib/utils/__init__.py (new file)
@@ -0,0 +1,12 @@
+from .byte_reader import ByteReader
+from .exceptions import DatasourceLibError, SyncStrategyNotFound, DataSourceNotFound
+from .file_reader import FileReader
+
+
+__all__ = [
+    "ByteReader",
+    "FileReader",
+    "DatasourceLibError",
+    "SyncStrategyNotFound",
+    "SourceNotFound"
+]
datasourcelib/utils/byte_reader.py (new file)
@@ -0,0 +1,256 @@
+from pathlib import Path
+from typing import Optional, Union, List
+import io
+import pandas as pd
+
+# --- Optional helpers ---
+from charset_normalizer import from_bytes as cn_from_bytes
+
+# DOCX
+from docx import Document as DocxDocument
+
+# PDF
+import fitz  # pymupdf
+import pdfplumber
+
+# PPTX
+from pptx import Presentation
+
+# YAML / XML
+import yaml
+from lxml import etree
+import json
+
+
+class ByteReader:
+    """
+    Unified reader for common file types.
+    - read_text(path): file path -> text
+    - read_table(path): file path -> DataFrame
+    - read_text_from_bytes(data, ext): bytes -> text
+    - read_table_from_bytes(data, ext): bytes -> DataFrame
+    """
+
+    TEXT_EXTS = {".txt", ".log", ".md"}
+    TABLE_EXTS = {".csv", ".tsv", ".xlsx", ".xls"}
+    DOCX_EXTS = {".docx"}
+    PDF_EXTS = {".pdf"}
+    PPTX_EXTS = {".pptx"}
+    JSON_EXTS = {".json"}
+    YAML_EXTS = {".yaml", ".yml"}
+    INI_EXTS = {".ini", ".cfg"}
+    XML_EXTS = {".xml"}
+
+    def __init__(self, default_encoding: str = "utf-8", errors: str = "replace"):
+        self.default_encoding = default_encoding
+        self.errors = errors
+
+    # -----------------------
+    # Public API (paths)
+    # -----------------------
+    def read_text(self, path: Union[str, Path]) -> str:
+        path = Path(path)
+        ext = path.suffix.lower()
+
+        if ext in self.TEXT_EXTS:
+            return path.read_text(encoding=self.default_encoding, errors=self.errors)
+
+        if ext in self.PDF_EXTS:
+            return self._read_pdf_text_path(path)
+
+        if ext in self.DOCX_EXTS:
+            return self._read_docx_text_fp(open(path, "rb"))
+
+        if ext in self.PPTX_EXTS:
+            return self._read_pptx_text_fp(open(path, "rb"))
+
+        if ext in self.JSON_EXTS:
+            with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+                obj = json.load(f)
+            return json.dumps(obj, indent=2, ensure_ascii=False)
+
+        if ext in self.YAML_EXTS:
+            with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+                obj = yaml.safe_load(f)
+            return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
+
+        if ext in self.INI_EXTS:
+            import configparser
+            parser = configparser.ConfigParser()
+            with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+                parser.read_file(f)
+            output = io.StringIO()
+            parser.write(output)
+            return output.getvalue()
+
+        if ext in self.XML_EXTS:
+            tree = etree.parse(str(path))
+            return etree.tostring(tree, pretty_print=True, encoding="unicode")
+
+        if ext in self.TABLE_EXTS:
+            df = self.read_table(path)
+            return df.to_csv(index=False)
+
+        raise ValueError(f"Unsupported file extension for text extraction: {ext}")
+
+    def read_table(self, path: Union[str, Path]) -> pd.DataFrame:
+        path = Path(path)
+        ext = path.suffix.lower()
+
+        if ext == ".csv":
+            return pd.read_csv(path)
+        if ext == ".tsv":
+            return pd.read_csv(path, sep="\t")
+        if ext == ".xlsx":
+            return pd.read_excel(path, engine="openpyxl")
+        if ext == ".xls":
+            return pd.read_excel(path, engine="xlrd")
+
+        # Fallback: attempt CSV read if unknown
+        try:
+            return pd.read_csv(path)
+        except Exception as e:
+            raise ValueError(f"Unsupported file extension for tables: {ext}") from e
+
+    # -----------------------
+    # Public API (bytes)
+    # -----------------------
+    def read_text_from_bytes(self, data: bytes, ext: str) -> str:
+        """
+        Extract text from in-memory bytes.
+        ext: file extension (e.g., '.pdf', '.docx', '.txt', '.pptx', '.json', '.yaml', '.xml', '.csv', '.xlsx')
+        """
+        ext = self._normalize_ext(ext)
+
+        if ext in self.TEXT_EXTS:
+            # Robust encoding detection
+            res = cn_from_bytes(data).best()
+            return str(res) if res else data.decode(self.default_encoding, errors=self.errors)
+
+        if ext in self.PDF_EXTS:
+            return self._read_pdf_text_bytes(data)
+
+        if ext in self.DOCX_EXTS:
+            return self._read_docx_text_fp(io.BytesIO(data))
+
+        if ext in self.PPTX_EXTS:
+            return self._read_pptx_text_fp(io.BytesIO(data))
+
+        if ext in self.JSON_EXTS:
+            obj = json.loads(data.decode(self.default_encoding, errors=self.errors))
+            return json.dumps(obj, indent=2, ensure_ascii=False)
+
+        if ext in self.YAML_EXTS:
+            obj = yaml.safe_load(data.decode(self.default_encoding, errors=self.errors))
+            return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
+
+        if ext in self.INI_EXTS:
+            import configparser
+            parser = configparser.ConfigParser()
+            parser.read_string(data.decode(self.default_encoding, errors=self.errors))
+            output = io.StringIO()
+            parser.write(output)
+            return output.getvalue()
+
+        if ext in self.XML_EXTS:
+            tree = etree.parse(io.BytesIO(data))
+            return etree.tostring(tree, pretty_print=True, encoding="unicode")
+
+        if ext in self.TABLE_EXTS:
+            df = self.read_table_from_bytes(data, ext)
+            return df.to_csv(index=False)
+
+        raise ValueError(f"Unsupported extension for text extraction from bytes: {ext}")
+
+    def read_table_from_bytes(self, data: bytes, ext: str) -> pd.DataFrame:
+        """
+        Load tabular data from in-memory bytes into a DataFrame.
+        """
+        ext = self._normalize_ext(ext)
+
+        if ext == ".csv":
+            return pd.read_csv(io.BytesIO(data))
+        if ext == ".tsv":
+            return pd.read_csv(io.BytesIO(data), sep="\t")
+        if ext == ".xlsx":
+            return pd.read_excel(io.BytesIO(data), engine="openpyxl")
+        if ext == ".xls":
+            return pd.read_excel(io.BytesIO(data), engine="xlrd")
+
+        # Opportunistic fallback: try CSV
+        try:
+            return pd.read_csv(io.BytesIO(data))
+        except Exception as e:
+            raise ValueError(f"Unsupported extension for table reading from bytes: {ext}") from e
+
+    # -----------------------
+    # Internal helpers
+    # -----------------------
+    def _normalize_ext(self, ext: str) -> str:
+        ext = (ext or "").strip().lower()
+        if not ext.startswith("."):
+            ext = "." + ext
+        return ext
+
+    def _read_pdf_text_path(self, path: Path) -> str:
+        # Prefer PyMuPDF
+        try:
+            parts: List[str] = []
+            with fitz.open(str(path)) as doc:
+                if doc.is_encrypted and not doc.authenticate(""):
+                    raise RuntimeError("Encrypted PDF requires a password.")
+                for page in doc:
+                    parts.append(page.get_text("text"))
+            text = "\n\n".join(parts).strip()
+            if text:
+                return text
+        except Exception:
+            pass
+
+        # Fallback: pdfplumber
+        with pdfplumber.open(str(path)) as pdf:
+            return "\n\n".join([(p.extract_text() or "") for p in pdf.pages]).strip()
+
+    def _read_pdf_text_bytes(self, data: bytes) -> str:
+        # PyMuPDF can open from bytes
+        try:
+            doc = fitz.open(stream=data, filetype="pdf")
+            parts: List[str] = []
+            if doc.is_encrypted and not doc.authenticate(""):
+                raise RuntimeError("Encrypted PDF requires a password.")
+            for page in doc:
+                parts.append(page.get_text("text"))
+            doc.close()
+            text = "\n\n".join(parts).strip()
+            if text:
+                return text
+        except Exception:
+            pass
+
+        # Fallback to pdfplumber from BytesIO
+        with pdfplumber.open(io.BytesIO(data)) as pdf:
+            return "\n\n".join([(p.extract_text() or "") for p in pdf.pages]).strip()
+
+    def _read_docx_text_fp(self, fp) -> str:
+        doc = DocxDocument(fp)
+        chunks = []
+        for p in doc.paragraphs:
+            if p.text:
+                chunks.append(p.text)
+        for table in doc.tables:
+            for row in table.rows:
+                cells = [cell.text.strip() for cell in row.cells]
+                if any(cells):
+                    chunks.append("\t".join(cells))
+        return "\n".join(chunks).strip()
+
+    def _read_pptx_text_fp(self, fp) -> str:
+        prs = Presentation(fp)
+        chunks = []
+        for slide in prs.slides:
+            for shape in slide.shapes:
+                if hasattr(shape, "has_text_frame") and shape.has_text_frame:
+                    text = shape.text or ""
+                    if text:
+                        chunks.append(text)
+        return "\n".join(chunks).strip()
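A short usage sketch for the new ByteReader, assuming in-memory bytes such as a downloaded blob; the PDF file name and the inline CSV bytes below are illustrative only:

# Sketch: ByteReader over in-memory content (extension may be passed with or without the dot).
from datasourcelib.utils import ByteReader

reader = ByteReader()

with open("report.pdf", "rb") as f:  # placeholder file
    pdf_text = reader.read_text_from_bytes(f.read(), ".pdf")

df = reader.read_table_from_bytes(b"a,b\n1,2\n", "csv")  # normalized to ".csv"
print(pdf_text[:200])
print(df.shape)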
datasourcelib/utils/exceptions.py (new file)
@@ -0,0 +1,9 @@
+class DatasourceLibError(Exception):
+    """Base exception for datasourcelib."""
+
+class SyncStrategyNotFound(DatasourceLibError):
+    """Raised when a strategy is not found."""
+
+# Added: DataSourceNotFound to represent missing/unknown data sources
+class DataSourceNotFound(DatasourceLibError):
+    """Raised when a data source is not found or not registered."""
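As a sketch of intended use, DataSourceNotFound would typically be raised from a registry-style lookup; the helper below is hypothetical and not part of the package:

# Hypothetical lookup helper showing where DataSourceNotFound fits.
from datasourcelib.utils.exceptions import DataSourceNotFound

def resolve_source(registry: dict, name: str):
    try:
        return registry[name]
    except KeyError:
        raise DataSourceNotFound(f"Data source '{name}' is not registered") from None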
datasourcelib/utils/file_reader.py (new file)
@@ -0,0 +1,217 @@
+from pathlib import Path
+from typing import Optional, Union, List
+import io
+import pandas as pd
+
+# --- Optional helpers ---
+from charset_normalizer import from_path as cn_from_path
+
+# DOCX
+from docx import Document as DocxDocument
+
+# PDF
+import fitz  # pymupdf
+import pdfplumber
+
+# PPTX
+from pptx import Presentation
+
+# YAML / XML
+import yaml
+from lxml import etree
+
+
+class FileReader:
+    """
+    A unified reader for common file types.
+    - read_text(path): extract text from txt, pdf, docx, pptx, json, yaml, ini, xml
+    - read_table(path): load tabular data from csv, tsv, xlsx/xls
+    """
+
+    TEXT_EXTS = {".txt", ".log", ".md"}
+    TABLE_EXTS = {".csv", ".tsv", ".xlsx", ".xls"}
+    DOCX_EXTS = {".docx"}
+    PDF_EXTS = {".pdf"}
+    PPTX_EXTS = {".pptx"}
+    JSON_EXTS = {".json"}
+    YAML_EXTS = {".yaml", ".yml"}
+    INI_EXTS = {".ini", ".cfg"}
+    XML_EXTS = {".xml"}
+
+    def __init__(self, default_encoding: str = "utf-8", errors: str = "replace"):
+        self.default_encoding = default_encoding
+        self.errors = errors
+
+    # -----------------------
+    # Public API
+    # -----------------------
+    def read_text(self, path: Union[str, Path]) -> str:
+        """
+        Extract best-effort text from a given file based on extension.
+        """
+        path = Path(path)
+        ext = path.suffix.lower()
+
+        if ext in self.TEXT_EXTS:
+            return self._read_plain_text(path)
+
+        if ext in self.PDF_EXTS:
+            return self._read_pdf_text(path)
+
+        if ext in self.DOCX_EXTS:
+            return self._read_docx_text(path)
+
+        if ext in self.PPTX_EXTS:
+            return self._read_pptx_text(path)
+
+        if ext in self.JSON_EXTS:
+            return self._read_json_text(path)
+
+        if ext in self.YAML_EXTS:
+            return self._read_yaml_text(path)
+
+        if ext in self.INI_EXTS:
+            return self._read_ini_text(path)
+
+        if ext in self.XML_EXTS:
+            return self._read_xml_text(path)
+
+        if ext in self.TABLE_EXTS:
+            # For tabular files, provide a quick text representation
+            df = self.read_table(path)
+            return df.to_csv(index=False)
+
+        raise ValueError(f"Unsupported file extension for text extraction: {ext}")
+
+    def read_table(self, path: Union[str, Path]) -> pd.DataFrame:
+        """
+        Load tabular data from CSV/TSV/Excel, returning a DataFrame.
+        """
+        path = Path(path)
+        ext = path.suffix.lower()
+
+        if ext == ".csv":
+            return pd.read_csv(path)
+        if ext == ".tsv":
+            return pd.read_csv(path, sep="\t")
+        if ext == ".xlsx":
+            return pd.read_excel(path, engine="openpyxl")
+        if ext == ".xls":
+            return pd.read_excel(path, engine="xlrd")
+
+        # Fallback: attempt CSV read if unknown
+        try:
+            return pd.read_csv(path)
+        except Exception as e:
+            raise ValueError(f"Unsupported file extension for tables: {ext}") from e
+
+    # -----------------------
+    # Text readers
+    # -----------------------
+    def _read_plain_text(self, path: Path) -> str:
+        # Detect encoding for robustness
+        res = cn_from_path(str(path)).best()
+        if res:
+            return str(res)
+        # Fallback to configured defaults
+        return path.read_text(encoding=self.default_encoding, errors=self.errors)
+
+    def _read_pdf_text(self, path: Path) -> str:
+        # Try PyMuPDF (fast, layout-aware)
+        try:
+            text_parts: List[str] = []
+            with fitz.open(str(path)) as doc:
+                if doc.can_save and doc.is_encrypted:
+                    # If encrypted and requires a password, this will fail to extract text.
+                    if not doc.authenticate(""):
+                        raise RuntimeError("Encrypted PDF requires a password.")
+                for page in doc:
+                    text_parts.append(page.get_text("text"))
+            text = "\n".join(text_parts).strip()
+            if text:
+                return text
+        except Exception:
+            pass
+
+        # Fallback to pdfplumber (good for tables/structured text)
+        try:
+            text_parts = []
+            with pdfplumber.open(str(path)) as pdf:
+                for page in pdf.pages:
+                    t = page.extract_text() or ""
+                    text_parts.append(t)
+            return "\n".join(text_parts).strip()
+        except Exception as e:
+            raise RuntimeError(f"Failed to read PDF: {e}") from e
+
+    def _read_docx_text(self, path: Path) -> str:
+        doc = DocxDocument(str(path))
+        chunks = []
+        # Paragraphs
+        for p in doc.paragraphs:
+            if p.text:
+                chunks.append(p.text)
+        # Tables (optional: include)
+        for table in doc.tables:
+            for row in table.rows:
+                cells = [cell.text.strip() for cell in row.cells]
+                if any(cells):
+                    chunks.append("\t".join(cells))
+        return "\n".join(chunks).strip()
+
+    def _read_pptx_text(self, path: Path) -> str:
+        prs = Presentation(str(path))
+        chunks = []
+        for slide in prs.slides:
+            for shape in slide.shapes:
+                if hasattr(shape, "text") and shape.has_text_frame:
+                    text = shape.text if hasattr(shape, "text") else ""
+                    if text:
+                        chunks.append(text)
+        return "\n".join(chunks).strip()
+
+    def _read_json_text(self, path: Path) -> str:
+        import json
+        with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+            obj = json.load(f)
+        # Pretty-print
+        return json.dumps(obj, indent=2, ensure_ascii=False)
+
+    def _read_yaml_text(self, path: Path) -> str:
+        with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+            obj = yaml.safe_load(f)
+        return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
+
+    def _read_ini_text(self, path: Path) -> str:
+        import configparser
+        parser = configparser.ConfigParser()
+        with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+            # INI files might have duplicate keys; defaults handle many cases
+            parser.read_file(f)
+        output = io.StringIO()
+        parser.write(output)
+        return output.getvalue()
+
+    def _read_xml_text(self, path: Path) -> str:
+        # Pretty-print XML
+        tree = etree.parse(str(path))
+        return etree.tostring(tree, pretty_print=True, encoding="unicode")
+
+
+# -----------------------
+# Example usage
+# -----------------------
+#if __name__ == "__main__":
+#    reader = FileReader()
+
+#    # 1) Extract text
+#    print(reader.read_text("document.pdf"))
+#    print(reader.read_text("report.docx"))
+#    print(reader.read_text("slides.pptx"))
+#    print(reader.read_text("notes.txt"))
+#    print(reader.read_text("config.yaml"))
+#    print(reader.read_text("data.xml"))
+
+#    # 2) Load tabular data
+#    df = reader.read_table("data.xlsx")
+#    print(df.head())
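The commented-out example block at the end of file_reader.py can be exercised roughly as follows; all file names are placeholders:

# Sketch based on the commented example above; paths are illustrative only.
from datasourcelib.utils.file_reader import FileReader

reader = FileReader()
print(reader.read_text("document.pdf"))   # extracted page text
print(reader.read_text("config.yaml"))    # round-tripped through yaml.safe_dump
df = reader.read_table("data.xlsx")        # DataFrame loaded via openpyxl
print(df.head())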
datasourcelib/utils/logger.py (new file)
@@ -0,0 +1,12 @@
+import logging
+from typing import Optional
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    logger = logging.getLogger(name or __name__)
+    if not logger.handlers:
+        h = logging.StreamHandler()
+        fmt = "%(asctime)s %(levelname)s %(name)s %(message)s"
+        h.setFormatter(logging.Formatter(fmt))
+        logger.addHandler(h)
+        logger.setLevel(logging.INFO)
+    return logger
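get_logger is the logging entry point used by every new module in this diff; a minimal usage sketch (the source name is a placeholder):

# Sketch: module-level logger with %-style lazy formatting, as used throughout the package.
from datasourcelib.utils.logger import get_logger

logger = get_logger(__name__)
logger.info("Starting sync for %s", "example-source")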
datasourcelib-0.1.4.dist-info/RECORD (new file)
@@ -0,0 +1,32 @@
+datasourcelib/__init__.py,sha256=I7JTSZ1J6ULg_TfdMEgFcd1regkCHuyKdZT4DcPtoyQ,78
+datasourcelib/core/__init__.py,sha256=nsXojDd97T7eMqqtCsZr1qSYLBitvKydSZRb9Dg7hqU,462
+datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3NAY,716
+datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
+datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
+datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
+datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
+datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
+datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
+datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
+datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
+datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
+datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
+datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
+datasourcelib/indexes/azure_search_index.py,sha256=kznAz06UXgyT1Clqj6gRhnBQ5HFw40ZQHJElRFIcbRo,22115
+datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
+datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
+datasourcelib/strategies/full_load.py,sha256=U1a9wO_ZLRnMInvU0IRW-ZKnhu0Cv437VcNMKIYuzMA,1691
+datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
+datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
+datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900
+datasourcelib/utils/__init__.py,sha256=9pSIpaK-kdmNuDzwl0Z7QU-_lV3cZE-iwOEPh3RBBTs,298
+datasourcelib/utils/byte_reader.py,sha256=GaoPXwJa2YTWG1Kim0K6JG20eVSaWkZJd1o9bswxHmc,9082
+datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4fYQww,369
+datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
+datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
+datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
+datasourcelib-0.1.4.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+datasourcelib-0.1.4.dist-info/METADATA,sha256=LR3db7O_rnbTmF_owLl-lH06xAfP-iZu4aXPtmjVtRo,1185
+datasourcelib-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datasourcelib-0.1.4.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+datasourcelib-0.1.4.dist-info/RECORD,,
datasourcelib/indexes/azure_search_index_only.py (removed)
@@ -1,162 +0,0 @@
-from typing import List, Dict, Any, Optional
-from datasourcelib.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-class AzureSearchIndexer:
-    """
-    Minimal Azure Cognitive Search indexer wrapper.
-    Expects vector_db_config with:
-    - service_endpoint: str
-    - index_name: str
-    - api_key: str
-    Optional:
-    - key_field: name of unique key in documents (default 'id')
-    """
-
-    def __init__(self, vector_db_config: Dict[str, Any]):
-        self.config = vector_db_config or {}
-        self._client = None
-        self._index_client = None
-
-    def validate_config(self) -> bool:
-        required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-        missing = [k for k in required if k not in self.config]
-        if missing:
-            logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-            return False
-        return True
-
-    def _ensure_sdk(self):
-        try:
-            from azure.core.credentials import AzureKeyCredential  # type: ignore
-            from azure.search.documents import SearchClient  # type: ignore
-            from azure.search.documents.indexes import SearchIndexClient  # type: ignore
-            from azure.search.documents.indexes.models import (
-                SearchIndex,
-                SimpleField,
-                SearchableField,
-                SearchFieldDataType,
-            )  # type: ignore
-        except Exception as e:
-            raise RuntimeError("azure-search-documents package is required: install azure-search-documents") from e
-
-        return AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType
-
-    def _infer_field_type(self, value) -> Any:
-        """
-        Map Python types to SearchFieldDataType
-        """
-        *_, SearchFieldDataType = self._ensure_sdk()
-        if value is None:
-            return SearchFieldDataType.String
-        t = type(value)
-        if t is str:
-            return SearchFieldDataType.String
-        if t is bool:
-            return SearchFieldDataType.Boolean
-        if t is int:
-            return SearchFieldDataType.Int32
-        if t is float:
-            return SearchFieldDataType.Double
-        # fallback to string
-        return SearchFieldDataType.String
-
-    def _build_fields(self, sample: Dict[str, Any], key_field: str):
-        AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-
-        fields = []
-        # ensure key field present
-        if key_field not in sample:
-            # we'll create a string key, uploader will populate unique ids
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-        else:
-            typ = self._infer_field_type(sample[key_field])
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-        for k, v in sample.items():
-            logger.info(f"================={k}============")
-            if k == key_field:
-                continue
-            typ = self._infer_field_type(v)
-            # for strings use SearchableField so full text queries work
-            if typ == SearchFieldDataType.String:
-                fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-            else:
-                fields.append(SimpleField(name=k, type=typ))
-        return fields
-
-    def create_index(self, sample: Dict[str, Any]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-            fields = self._build_fields(sample, key_field)
-            logger.info("=================Creating Index============")
-            index = SearchIndex(name=index_name, fields=fields)
-            # create or update index
-            index_client.create_or_update_index(index)
-            logger.info("Azure Search index '%s' created/updated", index_name)
-            return True
-        except Exception as ex:
-            logger.exception("AzureSearchIndexer.create_index failed")
-            return False
-
-    def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, SearchIndex, SimpleField, SearchableField, SearchFieldDataType = self._ensure_sdk()
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            # ensure each doc has key_field
-            from uuid import uuid4
-            for d in docs:
-                if key_field not in d:
-                    d[key_field] = str(uuid4())
-            # ensure each doc has key_field is of string type
-            for d in docs:
-                if key_field in d:
-                    typ = self._infer_field_type(d[key_field])
-                    if typ != SearchFieldDataType.String:
-                        d[key_field] = str(d[key_field])
-
-            client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))
-            logger.info("Uploading %d documents to index %s", len(docs), index_name)
-            result = client.upload_documents(documents=docs)
-            # Check results for failures
-            failed = [r for r in result if not r.succeeded]
-            if failed:
-                logger.error("Some documents failed to upload: %s", failed)
-                return False
-            logger.info("Uploaded documents successfully")
-            return True
-        except Exception:
-            logger.exception("AzureSearchIndexer.upload_documents failed")
-            return False
-
-    def index(self, rows: List[Dict[str, Any]]) -> bool:
-        """
-        High level: create index (based on first row) and upload all rows.
-        """
-        if not rows:
-            logger.error("AzureSearchIndexer.index called with empty rows")
-            return False
-        try:
-            if not self.validate_config():
-                return False
-            sample = rows[0]
-            logger.info(f"================={sample}============")
-            ok = self.create_index(sample)
-            if not ok:
-                return False
-            ok2 = self.upload_documents(rows)
-            return ok2
-        except Exception:
-            logger.exception("AzureSearchIndexer.index failed")
-            return False
datasourcelib/indexes/azure_search_index_vector.py (removed)
@@ -1,286 +0,0 @@
-from typing import List, Dict, Any, Optional
-from datasourcelib.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-class AzureSearchIndexer:
-    """
-    Azure Cognitive Search indexer with vector search support.
-    Required vector_db_config:
-    - aisearch_endpoint: str
-    - aisearch_index_name: str
-    - aisearch_api_key
-
-    Optional vector search config:
-    - vectorization: bool (enable vector search)
-    - vector_config: dict
-        - dimensions: int (default 1024)
-        - algorithm: str ('hnsw' or 'flat', default 'hnsw')
-        - metric: str ('cosine', 'euclidean', 'dotProduct', default 'cosine')
-    - key_field: str (default 'id')
-    - vector_field: str (default 'contentVector')
-    - embedding_endpoint: str (Azure OpenAI endpoint for embeddings)
-    - embedding_key: str (Azure OpenAI API key)
-    - embedding_deployment: str (Azure OpenAI model deployment name)
-    """
-
-    def __init__(self, vector_db_config: Dict[str, Any]):
-        self.config = vector_db_config or {}
-        self._client = None
-        self._index_client = None
-        self._embedding_client = None
-
-    def validate_config(self) -> bool:
-        required = ("aisearch_endpoint", "aisearch_index_name", "aisearch_api_key")
-        missing = [k for k in required if k not in self.config]
-
-        # Check vector search requirements if enabled
-        if self.config.get("vectorization", False):
-            vector_required = ("embedding_endpoint", "embedding_key", "embedding_deployment")
-            missing.extend([k for k in vector_required if k not in self.config])
-
-        if missing:
-            logger.error("AzureSearchIndexer.validate_config missing: %s", missing)
-            return False
-        return True
-
-    def _ensure_sdk(self):
-        try:
-            from azure.core.credentials import AzureKeyCredential  # type: ignore
-            from azure.search.documents import SearchClient  # type: ignore
-            from azure.search.documents.indexes import SearchIndexClient  # type: ignore
-            from openai import AzureOpenAI  # type: ignore
-            from azure.search.documents.indexes.models import (
-                SearchIndex,
-                SearchField,
-                SearchFieldDataType,
-                SimpleField,
-                SearchableField,
-                VectorSearch,
-                VectorSearchProfile,
-                HnswAlgorithmConfiguration
-            )  # type: ignore
-
-        except Exception as e:
-            raise RuntimeError("Required packages missing. Install: azure-search-documents openai") from e
-
-        return (
-            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration
-        )
-
-    def _setup_embedding_client(self):
-        if not self._embedding_client and self.config.get("vectorization"):
-            try:
-                AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-                self._embedding_client = AzureOpenAI(
-                    api_version=self.config["embedding_api_version"],
-                    azure_endpoint=self.config["embedding_endpoint"],
-                    api_key=self.config["embedding_key"],
-                )
-                logger.info("Azure OpenAI embedding client initialized")
-            except Exception as ex:
-                logger.exception("Failed to initialize embedding client")
-                raise
-
-    def _get_embeddings(self, text: str) -> List[float]:
-        try:
-            self._setup_embedding_client()
-            response = self._embedding_client.embeddings.create(
-                model=self.config["embedding_deployment"],
-                input=text
-            )
-            return response.data[0].embedding
-        except Exception as ex:
-            logger.exception(f"Failed to get embeddings for text: {text[:100]}...")
-            raise
-
-    def _build_vector_search_config(self):
-        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-        vector_config = self.config.get("vector_config", {})
-        dimensions = vector_config.get("dimensions", 1536)
-
-        vector_search = VectorSearch(
-            profiles=[VectorSearchProfile(name="vector-profile-1", algorithm_configuration_name="algorithms-config-1")],
-            algorithms=[HnswAlgorithmConfiguration(name="algorithms-config-1")]
-        )
-
-        return vector_search, dimensions
-
-    def _infer_field_type(self, value) -> Any:
-        #Map Python types to SearchFieldDataType, including collections
-
-        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-        if value is None:
-            return SearchFieldDataType.String
-
-        t = type(value)
-
-        # Handle list/array types as Collections
-        if t in (list, tuple):
-            # If empty list, default to Collection of Double
-            if not value:
-                return SearchFieldDataType.Collection(SearchFieldDataType.Double)
-            # Get type of first element for non-empty lists
-            element_type = self._infer_field_type(value[0])
-            return SearchFieldDataType.Collection(element_type)
-        # Handle vector embeddings (list or tuple of floats)
-        if type(value) in (list, tuple) and all(isinstance(x, (int, float)) for x in value):
-            return SearchFieldDataType.Collection(SearchFieldDataType.Single)
-
-        # Handle basic types
-        logger.info(f"######## Infer field type for value:[ {value} ] of type [ {t} ]")
-        if t is bool:
-            return SearchFieldDataType.Boolean
-        if t is int:
-            return SearchFieldDataType.Int32
-        if t is float:
-            return SearchFieldDataType.Double
-        print(f"############## Infer field type for value: {value} of type {t}")
-        print(t is str)
-        if t is str:
-            return SearchFieldDataType.String
-        # fallback to string
-        logger.warning(f"Falling back to string type for value: {value} of type {t}")
-        return SearchFieldDataType.String
-
-    def _build_fields(self, sample: Dict[str, Any], key_field: str):
-        AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-        fields = []
-        # Add key field
-        if key_field not in sample:
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-        else:
-            fields.append(SimpleField(name=key_field, type=SearchFieldDataType.String, key=True))
-
-        # Add regular fields
-        for k, v in sample.items():
-            logger.info(f"================={k}============")
-            if k == key_field:
-                continue
-            logger.info(f"#### Infer field type for field: {k}")
-            typ = self._infer_field_type(v)
-            logger.info(f"#### Inferred type for field {k}: {typ}")
-            if typ == SearchFieldDataType.String:
-                fields.append(SearchableField(name=k, type=SearchFieldDataType.String))
-            else:
-                fields.append(SimpleField(name=k, type=typ))
-
-        # Add vector field if vectorization is enabled
-        if self.config.get("vectorization"):
-            vector_field = self.config.get("vector_field", "contentVector")
-            _, dimensions = self._build_vector_search_config()
-            fields.append(
-                SearchField(
-                    name=vector_field,
-                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
-                    searchable=True,
-                    vector_search_dimensions=dimensions,
-                    vector_search_profile_name="vector-profile-1"
-                )
-            )
-
-        return fields
-
-    def create_index(self, sample: Dict[str, Any]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            index_client = SearchIndexClient(endpoint, AzureKeyCredential(api_key))
-            fields = self._build_fields(sample, key_field)
-
-            # Create index with vector search if enabled
-            if self.config.get("vectorization"):
-                vector_search, _ = self._build_vector_search_config()
-                index = SearchIndex(
-                    name=index_name,
-                    fields=fields,
-                    vector_search=vector_search
-                )
-            else:
-                index = SearchIndex(name=index_name, fields=fields)
-
-            index_client.create_or_update_index(index)
-            logger.info(f"Azure Search index '{index_name}' created/updated with vectorization={self.config.get('vectorization', False)}")
-            return True
-        except Exception as ex:
-            logger.exception("AzureSearchIndexer.create_index failed")
-            return False
-
-    def upload_documents(self, docs: List[Dict[str, Any]]) -> bool:
-        try:
-            AzureKeyCredential, SearchClient, SearchIndexClient, AzureOpenAI, SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration = self._ensure_sdk()
-            endpoint = self.config["aisearch_endpoint"]
-            api_key = self.config["aisearch_api_key"]
-            index_name = self.config["aisearch_index_name"]
-            key_field = self.config.get("key_field", "id")
-
-            # Add IDs if missing
-            from uuid import uuid4
-            for d in docs:
-                if key_field not in d:
-                    d[key_field] = str(uuid4())
-                elif not isinstance(d[key_field], str):
-                    d[key_field] = str(d[key_field])
-
-            # Add vector embeddings if enabled
-            if self.config.get("vectorization"):
-                vector_field = self.config.get("vector_field", "contentVector")
-                content_field = self.config.get("content_field", "content")
-
-                for doc in docs:
-                    if content_field in doc:
-                        try:
-                            embedding = self._get_embeddings(str(doc[content_field]))
-                            doc[vector_field] = embedding
-                        except Exception as e:
-                            logger.error(f"Failed to get embedding for document {doc.get(key_field)}: {str(e)}")
-                            continue
-
-            client = SearchClient(endpoint=endpoint, index_name=index_name,
-                                  credential=AzureKeyCredential(api_key))
-
-            logger.info(f"Uploading {len(docs)} documents to index {index_name}")
-            result = client.upload_documents(documents=docs)
-
-            failed = [r for r in result if not r.succeeded]
-            if failed:
-                logger.error(f"Some documents failed to upload: {failed}")
-                return False
-
-            logger.info("Documents uploaded successfully")
-            return True
-
-        except Exception:
-            logger.exception("AzureSearchIndexer.upload_documents failed")
-            return False
-
-    def index(self, rows: List[Dict[str, Any]]) -> bool:
-        """High level: create index (based on first row) and upload all rows."""
-        if not rows:
-            logger.error("AzureSearchIndexer.index called with empty rows")
-            return False
-
-        try:
-            if not self.validate_config():
-                return False
-
-            sample = rows[0]
-            logger.info(f"Creating/updating index with sample: {sample}")
-
-            ok = self.create_index(sample)
-            if not ok:
-                return False
-
-            ok2 = self.upload_documents(rows)
-            return ok2
-
-        except Exception:
-            logger.exception("AzureSearchIndexer.index failed")
-            return False
datasourcelib-0.1.2.dist-info/RECORD (removed)
@@ -1,22 +0,0 @@
-datasourcelib/__init__.py,sha256=I7JTSZ1J6ULg_TfdMEgFcd1regkCHuyKdZT4DcPtoyQ,78
-datasourcelib/core/__init__.py,sha256=nsXojDd97T7eMqqtCsZr1qSYLBitvKydSZRb9Dg7hqU,462
-datasourcelib/core/sync_base.py,sha256=AfwwaV3rJOFKVmKKpSj-BwznnCDCaeuT4LLNDfA3NAY,716
-datasourcelib/core/sync_manager.py,sha256=lj070S3PwSNcB0UL_ZDzDAm6uJ9G38TY491vQZ1dL3o,3849
-datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
-datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
-datasourcelib/datasources/azure_devops_source.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
-datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
-datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
-datasourcelib/datasources/datasource_types.py,sha256=eEiWymYS05X_TxwuB7P3MpphPG1En67h3kRiSGeHjQ0,176
-datasourcelib/datasources/sharepoint_source - Copy.py,sha256=7V1c-zyvTo4IuPN_YMrKwLZFgbtipbP-mtunmXjOLJQ,17664
-datasourcelib/datasources/sharepoint_source.py,sha256=Pv9735Gu2FylVeeT9e_cZlCvgGUwxn-pVRRZQe2PHU8,20196
-datasourcelib/datasources/sql_source.py,sha256=sCYHrmeD82fQVcdQjL9Y2TTTjaqlv2v8B5noAng3Bl4,5450
-datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6HhMyT0,94
-datasourcelib/indexes/azure_search_index.py,sha256=o3BoSxURBk5jCC3AlNz-v9_igg-dXYS4yUxXZwSfqFg,17265
-datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
-datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
-datasourcelib-0.1.2.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
-datasourcelib-0.1.2.dist-info/METADATA,sha256=DOKGwf3XspFhCQRLYLod8Oqc2sUjhDaFFe15xiKqQhQ,1185
-datasourcelib-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datasourcelib-0.1.2.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
-datasourcelib-0.1.2.dist-info/RECORD,,
Files without changes:
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/WHEEL
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/licenses/LICENSE
- {datasourcelib-0.1.2.dist-info → datasourcelib-0.1.4.dist-info}/top_level.txt