datasourcelib 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datasourcelib/strategies/__init__.py
@@ -0,0 +1,14 @@
+ from .daily_load import DailyLoadStrategy
+ from .full_load import FullLoadStrategy
+ from .incremental_load import IncrementalLoadStrategy
+ from .ondemand_load import OnDemandLoadStrategy
+ from .timerange_load import TimeRangeLoadStrategy
+
+
+ __all__ = [
+     "DailyLoadStrategy",
+     "FullLoadStrategy",
+     "IncrementalLoadStrategy",
+     "OnDemandLoadStrategy",
+     "TimeRangeLoadStrategy"
+ ]
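With the package layout above (see the RECORD listing at the end of this diff), any strategy can be imported directly from the subpackage, for example:

    from datasourcelib.strategies import FullLoadStrategy, IncrementalLoadStrategy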
datasourcelib/strategies/daily_load.py
@@ -0,0 +1,22 @@
+ from datasourcelib.core.sync_base import SyncBase
+ from datasourcelib.utils.logger import get_logger
+ from datetime import datetime
+
+ logger = get_logger(__name__)
+
+ class DailyLoadStrategy(SyncBase):
+     """Daily scheduled load (wraps incremental)."""
+
+     def validate(self) -> bool:
+         return True
+
+     def sync(self, run_date: str = None, **kwargs) -> bool:
+         try:
+             run_date = run_date or datetime.utcnow().date().isoformat()
+             logger.info("Starting daily load for %s", run_date)
+             # Typically call incremental with last_sync = previous day midnight
+             # TODO: implement scheduling integration externally; the strategy here is idempotent
+             return True
+         except Exception:
+             logger.exception("DailyLoadStrategy.sync failed")
+             return False
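The "previous day midnight" cursor mentioned in the comment is plain datetime arithmetic; a minimal sketch (the helper name is ours, not part of the package):

    from datetime import datetime, timedelta, timezone

    def previous_midnight_utc(run_date: str) -> str:
        # Midnight (00:00 UTC) of the day before run_date, as an ISO-8601 string.
        day = datetime.fromisoformat(run_date)
        start = datetime(day.year, day.month, day.day, tzinfo=timezone.utc) - timedelta(days=1)
        return start.isoformat()

    print(previous_midnight_utc("2024-05-02"))  # 2024-05-01T00:00:00+00:00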
datasourcelib/strategies/full_load.py
@@ -0,0 +1,38 @@
+ from datasourcelib.core.sync_base import SyncBase
+ from datasourcelib.utils.logger import get_logger
+ from datasourcelib.indexes.azure_search_index_vector import AzureSearchIndexer
+ logger = get_logger(__name__)
+
+ class FullLoadStrategy(SyncBase):
+     """Full load: replace or reload the entire source into the vector DB."""
+
+     def validate(self) -> bool:
+         # Minimal validation: required keys exist
+         ok = self.data_source.validate_config()
+         return ok
+
+     def sync(self, **kwargs) -> bool:
+         try:
+             logger.info("Running full data load")
+             data = self.data_source.fetch_data(**kwargs)
+             for key, value in kwargs.items():
+                 logger.debug("%s = %s", key, value)
+             # Implement real extract -> transform -> load to vector DB
+             # Example pseudocode:
+             # vector_client.upsert_batch(self.vector_db_config, rows)
+             # New: use AzureSearchIndexer to create the index and upload documents if requested
+             if isinstance(data, list) and data:
+                 indexer = AzureSearchIndexer(self.vector_db_config or {})
+                 if not indexer.validate_config():
+                     logger.error("Vector DB config invalid for Azure Search indexer")
+                     return False
+                 ok = indexer.index(data)
+                 if not ok:
+                     logger.error("Indexing data to Azure Search failed")
+                     return False
+
+             logger.info("Full data load finished successfully")
+             return True
+         except Exception:
+             logger.exception("FullLoadStrategy.sync failed")
+             return False
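The indexing flow only relies on the two AzureSearchIndexer calls shown above (validate_config, then index). A stripped-down driver for the same flow, with a hypothetical stand-in indexer for local testing:

    class StubIndexer:
        """Hypothetical stand-in mirroring the two AzureSearchIndexer calls used above."""
        def __init__(self, cfg):
            self.cfg = cfg
        def validate_config(self) -> bool:
            return bool(self.cfg.get("endpoint"))
        def index(self, docs) -> bool:
            print(f"indexed {len(docs)} documents")
            return True

    def full_load(data, cfg) -> bool:
        if not (isinstance(data, list) and data):
            return True  # nothing to index
        indexer = StubIndexer(cfg)
        if not indexer.validate_config():
            return False
        return indexer.index(data)

    full_load([{"id": "1", "text": "hello"}], {"endpoint": "https://example.net"})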
datasourcelib/strategies/incremental_load.py
@@ -0,0 +1,27 @@
+ from datetime import datetime
+ from datasourcelib.core.sync_base import SyncBase
+ from datasourcelib.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ class IncrementalLoadStrategy(SyncBase):
+     """Incremental load using last_sync timestamp or cursor."""
+
+     def validate(self) -> bool:
+         # require source to support incremental field or cursor
+         if "cursor_field" not in self.source_config and "last_sync" not in self.source_config:
+             logger.error("IncrementalLoadStrategy missing cursor_field or last_sync in source_config")
+             return False
+         return True
+
+     def sync(self, last_sync: str = None, **kwargs) -> bool:
+         try:
+             last = last_sync or self.source_config.get("last_sync")
+             logger.info("Running incremental load since %s", last)
+             # TODO: fetch delta rows since 'last' and upsert to vector DB
+             # After successful run store new last_sync timestamp
+             logger.info("Incremental load completed")
+             return True
+         except Exception:
+             logger.exception("IncrementalLoadStrategy.sync failed")
+             return False
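The TODO describes a standard watermark pattern: fetch rows newer than the stored cursor, upsert them, and only then advance the cursor. A self-contained sketch of that pattern (the row shape and the upsert callable are assumptions, not part of this package):

    from datetime import datetime

    def run_incremental(rows, last_sync: str, upsert) -> str:
        # rows: iterable of dicts with an ISO-8601 "updated_at" field (assumed shape).
        # upsert: callable that persists a batch to the vector DB (assumed).
        watermark = datetime.fromisoformat(last_sync)
        delta = [r for r in rows if datetime.fromisoformat(r["updated_at"]) > watermark]
        if delta:
            upsert(delta)
            # Advance the watermark only after the upsert succeeds, so a failed
            # run is retried from the same point on the next invocation.
            watermark = max(datetime.fromisoformat(r["updated_at"]) for r in delta)
        return watermark.isoformat()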
datasourcelib/strategies/ondemand_load.py
@@ -0,0 +1,19 @@
+ from datasourcelib.core.sync_base import SyncBase
+ from datasourcelib.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ class OnDemandLoadStrategy(SyncBase):
+     """On-demand load triggered by user request (arbitrary params)."""
+
+     def validate(self) -> bool:
+         return True
+
+     def sync(self, **kwargs) -> bool:
+         try:
+             logger.info("On-demand sync invoked with params: %s", kwargs)
+             # Use kwargs to drive partial loads, filters, ids, etc.
+             return True
+         except Exception:
+             logger.exception("OnDemandLoadStrategy.sync failed")
+             return False
datasourcelib/strategies/timerange_load.py
@@ -0,0 +1,24 @@
+ from datetime import datetime
+ from datasourcelib.core.sync_base import SyncBase
+ from datasourcelib.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ class TimeRangeLoadStrategy(SyncBase):
+     """Load records between a start and end timestamp."""
+
+     def validate(self) -> bool:
+         # rely on params at runtime; minimal validation OK
+         return True
+
+     def sync(self, start: str = None, end: str = None, **kwargs) -> bool:
+         try:
+             if not start or not end:
+                 logger.error("TimeRangeLoadStrategy requires 'start' and 'end'")
+                 return False
+             logger.info("Time range load between %s and %s", start, end)
+             # TODO: query source for timeframe and upsert
+             return True
+         except Exception:
+             logger.exception("TimeRangeLoadStrategy.sync failed")
+             return False
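Beyond the presence check, the bounds can be parsed and ordered before the source is queried; a small helper of our own illustrating the validation the TODO would need:

    from datetime import datetime

    def parse_range(start: str, end: str):
        # Parse ISO-8601 bounds and reject inverted ranges before querying the source.
        s, e = datetime.fromisoformat(start), datetime.fromisoformat(end)
        if s >= e:
            raise ValueError(f"'start' ({start}) must be earlier than 'end' ({end})")
        return s, e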
datasourcelib/utils/__init__.py
@@ -0,0 +1,12 @@
+ from .byte_reader import ByteReader
+ from .exceptions import DatasourceLibError, SyncStrategyNotFound, DataSourceNotFound
+ from .file_reader import FileReader
+
+
+ __all__ = [
+     "ByteReader",
+     "FileReader",
+     "DatasourceLibError",
+     "SyncStrategyNotFound",
+     "DataSourceNotFound"
+ ]
datasourcelib/utils/byte_reader.py
@@ -0,0 +1,256 @@
+ from pathlib import Path
+ from typing import Union, List
+ import io
+ import pandas as pd
+
+ # --- Optional helpers ---
+ from charset_normalizer import from_bytes as cn_from_bytes
+
+ # DOCX
+ from docx import Document as DocxDocument
+
+ # PDF
+ import fitz  # pymupdf
+ import pdfplumber
+
+ # PPTX
+ from pptx import Presentation
+
+ # YAML / XML
+ import yaml
+ from lxml import etree
+ import json
+
+
+ class ByteReader:
+     """
+     Unified reader for common file types.
+     - read_text(path): file path -> text
+     - read_table(path): file path -> DataFrame
+     - read_text_from_bytes(data, ext): bytes -> text
+     - read_table_from_bytes(data, ext): bytes -> DataFrame
+     """
+
+     TEXT_EXTS = {".txt", ".log", ".md"}
+     TABLE_EXTS = {".csv", ".tsv", ".xlsx", ".xls"}
+     DOCX_EXTS = {".docx"}
+     PDF_EXTS = {".pdf"}
+     PPTX_EXTS = {".pptx"}
+     JSON_EXTS = {".json"}
+     YAML_EXTS = {".yaml", ".yml"}
+     INI_EXTS = {".ini", ".cfg"}
+     XML_EXTS = {".xml"}
+
+     def __init__(self, default_encoding: str = "utf-8", errors: str = "replace"):
+         self.default_encoding = default_encoding
+         self.errors = errors
+
+     # -----------------------
+     # Public API (paths)
+     # -----------------------
+     def read_text(self, path: Union[str, Path]) -> str:
+         path = Path(path)
+         ext = path.suffix.lower()
+
+         if ext in self.TEXT_EXTS:
+             return path.read_text(encoding=self.default_encoding, errors=self.errors)
+
+         if ext in self.PDF_EXTS:
+             return self._read_pdf_text_path(path)
+
+         if ext in self.DOCX_EXTS:
+             return self._read_docx_text_fp(io.BytesIO(path.read_bytes()))  # avoid leaking an open file handle
+
+         if ext in self.PPTX_EXTS:
+             return self._read_pptx_text_fp(io.BytesIO(path.read_bytes()))  # avoid leaking an open file handle
+
+         if ext in self.JSON_EXTS:
+             with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+                 obj = json.load(f)
+             return json.dumps(obj, indent=2, ensure_ascii=False)
+
+         if ext in self.YAML_EXTS:
+             with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+                 obj = yaml.safe_load(f)
+             return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
+
+         if ext in self.INI_EXTS:
+             import configparser
+             parser = configparser.ConfigParser()
+             with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+                 parser.read_file(f)
+             output = io.StringIO()
+             parser.write(output)
+             return output.getvalue()
+
+         if ext in self.XML_EXTS:
+             tree = etree.parse(str(path))
+             return etree.tostring(tree, pretty_print=True, encoding="unicode")
+
+         if ext in self.TABLE_EXTS:
+             df = self.read_table(path)
+             return df.to_csv(index=False)
+
+         raise ValueError(f"Unsupported file extension for text extraction: {ext}")
+
+     def read_table(self, path: Union[str, Path]) -> pd.DataFrame:
+         path = Path(path)
+         ext = path.suffix.lower()
+
+         if ext == ".csv":
+             return pd.read_csv(path)
+         if ext == ".tsv":
+             return pd.read_csv(path, sep="\t")
+         if ext == ".xlsx":
+             return pd.read_excel(path, engine="openpyxl")
+         if ext == ".xls":
+             return pd.read_excel(path, engine="xlrd")
+
+         # Fallback: attempt CSV read if unknown
+         try:
+             return pd.read_csv(path)
+         except Exception as e:
+             raise ValueError(f"Unsupported file extension for tables: {ext}") from e
+
+     # -----------------------
+     # Public API (bytes)
+     # -----------------------
+     def read_text_from_bytes(self, data: bytes, ext: str) -> str:
+         """
+         Extract text from in-memory bytes.
+         ext: file extension (e.g., '.pdf', '.docx', '.txt', '.pptx', '.json', '.yaml', '.xml', '.csv', '.xlsx')
+         """
+         ext = self._normalize_ext(ext)
+
+         if ext in self.TEXT_EXTS:
+             # Robust encoding detection
+             res = cn_from_bytes(data).best()
+             return str(res) if res else data.decode(self.default_encoding, errors=self.errors)
+
+         if ext in self.PDF_EXTS:
+             return self._read_pdf_text_bytes(data)
+
+         if ext in self.DOCX_EXTS:
+             return self._read_docx_text_fp(io.BytesIO(data))
+
+         if ext in self.PPTX_EXTS:
+             return self._read_pptx_text_fp(io.BytesIO(data))
+
+         if ext in self.JSON_EXTS:
+             obj = json.loads(data.decode(self.default_encoding, errors=self.errors))
+             return json.dumps(obj, indent=2, ensure_ascii=False)
+
+         if ext in self.YAML_EXTS:
+             obj = yaml.safe_load(data.decode(self.default_encoding, errors=self.errors))
+             return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
+
+         if ext in self.INI_EXTS:
+             import configparser
+             parser = configparser.ConfigParser()
+             parser.read_string(data.decode(self.default_encoding, errors=self.errors))
+             output = io.StringIO()
+             parser.write(output)
+             return output.getvalue()
+
+         if ext in self.XML_EXTS:
+             tree = etree.parse(io.BytesIO(data))
+             return etree.tostring(tree, pretty_print=True, encoding="unicode")
+
+         if ext in self.TABLE_EXTS:
+             df = self.read_table_from_bytes(data, ext)
+             return df.to_csv(index=False)
+
+         raise ValueError(f"Unsupported extension for text extraction from bytes: {ext}")
+
+     def read_table_from_bytes(self, data: bytes, ext: str) -> pd.DataFrame:
+         """
+         Load tabular data from in-memory bytes into a DataFrame.
+         """
+         ext = self._normalize_ext(ext)
+
+         if ext == ".csv":
+             return pd.read_csv(io.BytesIO(data))
+         if ext == ".tsv":
+             return pd.read_csv(io.BytesIO(data), sep="\t")
+         if ext == ".xlsx":
+             return pd.read_excel(io.BytesIO(data), engine="openpyxl")
+         if ext == ".xls":
+             return pd.read_excel(io.BytesIO(data), engine="xlrd")
+
+         # Opportunistic fallback: try CSV
+         try:
+             return pd.read_csv(io.BytesIO(data))
+         except Exception as e:
+             raise ValueError(f"Unsupported extension for table reading from bytes: {ext}") from e
+
+     # -----------------------
+     # Internal helpers
+     # -----------------------
+     def _normalize_ext(self, ext: str) -> str:
+         ext = (ext or "").strip().lower()
+         if not ext.startswith("."):
+             ext = "." + ext
+         return ext
+
+     def _read_pdf_text_path(self, path: Path) -> str:
+         # Prefer PyMuPDF
+         try:
+             parts: List[str] = []
+             with fitz.open(str(path)) as doc:
+                 if doc.is_encrypted and not doc.authenticate(""):
+                     raise RuntimeError("Encrypted PDF requires a password.")
+                 for page in doc:
+                     parts.append(page.get_text("text"))
+             text = "\n\n".join(parts).strip()
+             if text:
+                 return text
+         except Exception:
+             pass
+
+         # Fallback: pdfplumber
+         with pdfplumber.open(str(path)) as pdf:
+             return "\n\n".join([(p.extract_text() or "") for p in pdf.pages]).strip()
+
+     def _read_pdf_text_bytes(self, data: bytes) -> str:
+         # PyMuPDF can open from bytes
+         try:
+             doc = fitz.open(stream=data, filetype="pdf")
+             parts: List[str] = []
+             if doc.is_encrypted and not doc.authenticate(""):
+                 raise RuntimeError("Encrypted PDF requires a password.")
+             for page in doc:
+                 parts.append(page.get_text("text"))
+             doc.close()
+             text = "\n\n".join(parts).strip()
+             if text:
+                 return text
+         except Exception:
+             pass
+
+         # Fallback to pdfplumber from BytesIO
+         with pdfplumber.open(io.BytesIO(data)) as pdf:
+             return "\n\n".join([(p.extract_text() or "") for p in pdf.pages]).strip()
+
+     def _read_docx_text_fp(self, fp) -> str:
+         doc = DocxDocument(fp)
+         chunks = []
+         for p in doc.paragraphs:
+             if p.text:
+                 chunks.append(p.text)
+         for table in doc.tables:
+             for row in table.rows:
+                 cells = [cell.text.strip() for cell in row.cells]
+                 if any(cells):
+                     chunks.append("\t".join(cells))
+         return "\n".join(chunks).strip()
+
+     def _read_pptx_text_fp(self, fp) -> str:
+         prs = Presentation(fp)
+         chunks = []
+         for slide in prs.slides:
+             for shape in slide.shapes:
+                 if hasattr(shape, "has_text_frame") and shape.has_text_frame:
+                     text = shape.text or ""
+                     if text:
+                         chunks.append(text)
+         return "\n".join(chunks).strip()
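A quick usage sketch of the bytes-oriented API above (the inputs are illustrative):

    reader = ByteReader()

    # Plain text from raw bytes; charset_normalizer picks the encoding
    text = reader.read_text_from_bytes("héllo wörld".encode("latin-1"), ext="txt")

    # Tabular bytes straight into a DataFrame (_normalize_ext adds the missing dot)
    df = reader.read_table_from_bytes(b"a,b\n1,2\n3,4\n", ext="csv")
    print(df.shape)  # (2, 2)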
datasourcelib/utils/exceptions.py
@@ -0,0 +1,9 @@
+ class DatasourceLibError(Exception):
+     """Base exception for datasourcelib."""
+
+ class SyncStrategyNotFound(DatasourceLibError):
+     """Raised when a strategy is not found."""
+
+ # Added: DataSourceNotFound to represent missing/unknown data sources
+ class DataSourceNotFound(DatasourceLibError):
+     """Raised when a data source is not found or not registered."""
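Because both specific errors subclass DatasourceLibError, callers can catch a precise failure or anything raised by the library; a brief illustration (the lookup table is hypothetical):

    STRATEGIES = {"full": "FullLoadStrategy"}  # hypothetical lookup table

    def resolve(name: str) -> str:
        try:
            return STRATEGIES[name]
        except KeyError:
            raise SyncStrategyNotFound(f"Unknown sync strategy: {name!r}") from None

    try:
        resolve("weekly")
    except DatasourceLibError as err:  # also catches SyncStrategyNotFound and DataSourceNotFound
        print(err)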
datasourcelib/utils/file_reader.py
@@ -0,0 +1,217 @@
+ from pathlib import Path
+ from typing import Union, List
+ import io
+ import pandas as pd
+
+ # --- Optional helpers ---
+ from charset_normalizer import from_path as cn_from_path
+
+ # DOCX
+ from docx import Document as DocxDocument
+
+ # PDF
+ import fitz  # pymupdf
+ import pdfplumber
+
+ # PPTX
+ from pptx import Presentation
+
+ # YAML / XML
+ import yaml
+ from lxml import etree
+
+
+ class FileReader:
+     """
+     A unified reader for common file types.
+     - read_text(path): extract text from txt, pdf, docx, pptx, json, yaml, ini, xml
+     - read_table(path): load tabular data from csv, tsv, xlsx/xls
+     """
+
+     TEXT_EXTS = {".txt", ".log", ".md"}
+     TABLE_EXTS = {".csv", ".tsv", ".xlsx", ".xls"}
+     DOCX_EXTS = {".docx"}
+     PDF_EXTS = {".pdf"}
+     PPTX_EXTS = {".pptx"}
+     JSON_EXTS = {".json"}
+     YAML_EXTS = {".yaml", ".yml"}
+     INI_EXTS = {".ini", ".cfg"}
+     XML_EXTS = {".xml"}
+
+     def __init__(self, default_encoding: str = "utf-8", errors: str = "replace"):
+         self.default_encoding = default_encoding
+         self.errors = errors
+
+     # -----------------------
+     # Public API
+     # -----------------------
+     def read_text(self, path: Union[str, Path]) -> str:
+         """
+         Extract best-effort text from a given file based on extension.
+         """
+         path = Path(path)
+         ext = path.suffix.lower()
+
+         if ext in self.TEXT_EXTS:
+             return self._read_plain_text(path)
+
+         if ext in self.PDF_EXTS:
+             return self._read_pdf_text(path)
+
+         if ext in self.DOCX_EXTS:
+             return self._read_docx_text(path)
+
+         if ext in self.PPTX_EXTS:
+             return self._read_pptx_text(path)
+
+         if ext in self.JSON_EXTS:
+             return self._read_json_text(path)
+
+         if ext in self.YAML_EXTS:
+             return self._read_yaml_text(path)
+
+         if ext in self.INI_EXTS:
+             return self._read_ini_text(path)
+
+         if ext in self.XML_EXTS:
+             return self._read_xml_text(path)
+
+         if ext in self.TABLE_EXTS:
+             # For tabular files, provide a quick text representation
+             df = self.read_table(path)
+             return df.to_csv(index=False)
+
+         raise ValueError(f"Unsupported file extension for text extraction: {ext}")
+
+     def read_table(self, path: Union[str, Path]) -> pd.DataFrame:
+         """
+         Load tabular data from CSV/TSV/Excel, returning a DataFrame.
+         """
+         path = Path(path)
+         ext = path.suffix.lower()
+
+         if ext == ".csv":
+             return pd.read_csv(path)
+         if ext == ".tsv":
+             return pd.read_csv(path, sep="\t")
+         if ext == ".xlsx":
+             return pd.read_excel(path, engine="openpyxl")
+         if ext == ".xls":
+             return pd.read_excel(path, engine="xlrd")
+
+         # Fallback: attempt CSV read if unknown
+         try:
+             return pd.read_csv(path)
+         except Exception as e:
+             raise ValueError(f"Unsupported file extension for tables: {ext}") from e
+
+     # -----------------------
+     # Text readers
+     # -----------------------
+     def _read_plain_text(self, path: Path) -> str:
+         # Detect encoding for robustness
+         res = cn_from_path(str(path)).best()
+         if res:
+             return str(res)
+         # Fallback to configured defaults
+         return path.read_text(encoding=self.default_encoding, errors=self.errors)
+
+     def _read_pdf_text(self, path: Path) -> str:
+         # Try PyMuPDF (fast, layout-aware)
+         try:
+             text_parts: List[str] = []
+             with fitz.open(str(path)) as doc:
+                 if doc.is_encrypted:
+                     # If the PDF requires a password, text extraction will fail.
+                     if not doc.authenticate(""):
+                         raise RuntimeError("Encrypted PDF requires a password.")
+                 for page in doc:
+                     text_parts.append(page.get_text("text"))
+             text = "\n".join(text_parts).strip()
+             if text:
+                 return text
+         except Exception:
+             pass
+
+         # Fallback to pdfplumber (good for tables/structured text)
+         try:
+             text_parts = []
+             with pdfplumber.open(str(path)) as pdf:
+                 for page in pdf.pages:
+                     t = page.extract_text() or ""
+                     text_parts.append(t)
+             return "\n".join(text_parts).strip()
+         except Exception as e:
+             raise RuntimeError(f"Failed to read PDF: {e}") from e
+
+     def _read_docx_text(self, path: Path) -> str:
+         doc = DocxDocument(str(path))
+         chunks = []
+         # Paragraphs
+         for p in doc.paragraphs:
+             if p.text:
+                 chunks.append(p.text)
+         # Tables (optional: include)
+         for table in doc.tables:
+             for row in table.rows:
+                 cells = [cell.text.strip() for cell in row.cells]
+                 if any(cells):
+                     chunks.append("\t".join(cells))
+         return "\n".join(chunks).strip()
+
+     def _read_pptx_text(self, path: Path) -> str:
+         prs = Presentation(str(path))
+         chunks = []
+         for slide in prs.slides:
+             for shape in slide.shapes:
+                 if shape.has_text_frame:
+                     text = shape.text or ""
+                     if text:
+                         chunks.append(text)
+         return "\n".join(chunks).strip()
+
+     def _read_json_text(self, path: Path) -> str:
+         import json
+         with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+             obj = json.load(f)
+         # Pretty-print
+         return json.dumps(obj, indent=2, ensure_ascii=False)
+
+     def _read_yaml_text(self, path: Path) -> str:
+         with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+             obj = yaml.safe_load(f)
+         return yaml.safe_dump(obj, sort_keys=False, allow_unicode=True)
+
+     def _read_ini_text(self, path: Path) -> str:
+         import configparser
+         parser = configparser.ConfigParser()
+         with path.open("r", encoding=self.default_encoding, errors=self.errors) as f:
+             # INI files might have duplicate keys; defaults handle many cases
+             parser.read_file(f)
+         output = io.StringIO()
+         parser.write(output)
+         return output.getvalue()
+
+     def _read_xml_text(self, path: Path) -> str:
+         # Pretty-print XML
+         tree = etree.parse(str(path))
+         return etree.tostring(tree, pretty_print=True, encoding="unicode")
+
+
+ # -----------------------
+ # Example usage
+ # -----------------------
+ # if __name__ == "__main__":
+ #     reader = FileReader()
+ #
+ #     # 1) Extract text
+ #     print(reader.read_text("document.pdf"))
+ #     print(reader.read_text("report.docx"))
+ #     print(reader.read_text("slides.pptx"))
+ #     print(reader.read_text("notes.txt"))
+ #     print(reader.read_text("config.yaml"))
+ #     print(reader.read_text("data.xml"))
+ #
+ #     # 2) Load tabular data
+ #     df = reader.read_table("data.xlsx")
+ #     print(df.head())
datasourcelib/utils/logger.py
@@ -0,0 +1,12 @@
+ import logging
+ from typing import Optional
+
+ def get_logger(name: Optional[str] = None) -> logging.Logger:
+     logger = logging.getLogger(name or __name__)
+     if not logger.handlers:
+         h = logging.StreamHandler()
+         fmt = "%(asctime)s %(levelname)s %(name)s %(message)s"
+         h.setFormatter(logging.Formatter(fmt))
+         logger.addHandler(h)
+         logger.setLevel(logging.INFO)
+     return logger
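The handlers guard keeps repeated get_logger calls from attaching duplicate handlers, so module-level use is safe:

    log = get_logger("datasourcelib.demo")
    log.info("first call attaches the handler")
    log = get_logger("datasourcelib.demo")
    log.info("second call reuses it; no duplicate output")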
datasourcelib/utils/validators.py
@@ -0,0 +1,7 @@
+ from typing import Dict
+
+ def require_keys(cfg: Dict, keys):
+     missing = [k for k in keys if k not in cfg]
+     if missing:
+         raise KeyError(f"Missing required keys: {missing}")
+     return True
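require_keys either returns True or raises with every missing key named at once, which makes config errors actionable; for example:

    cfg = {"endpoint": "https://example.search.windows.net"}
    require_keys(cfg, ["endpoint"])              # returns True
    require_keys(cfg, ["endpoint", "api_key"])   # KeyError: Missing required keys: ['api_key']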
datasourcelib-0.1.3.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datasourcelib
- Version: 0.1.2
+ Version: 0.1.3
  Summary: Data source sync strategies for vector DBs
  Home-page: https://github.com/jaiprakash0217/datasourcelib
  Author: Jai Prakash

datasourcelib-0.1.3.dist-info/RECORD
@@ -15,8 +15,20 @@ datasourcelib/indexes/__init__.py,sha256=S8dz-lyxy1BTuDuLGRJNLrZD_1ku_FIUnDEm6Hh
  datasourcelib/indexes/azure_search_index.py,sha256=o3BoSxURBk5jCC3AlNz-v9_igg-dXYS4yUxXZwSfqFg,17265
  datasourcelib/indexes/azure_search_index_only.py,sha256=SulrYPehWGaf3Wi_Dw8UvFneSY-UwEK9viVYXwIlQuI,7120
  datasourcelib/indexes/azure_search_index_vector.py,sha256=4By1vJHv1ORiWOpTqO5wR0sTrq1TaEHP6t8MoOINhok,13410
- datasourcelib-0.1.2.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
- datasourcelib-0.1.2.dist-info/METADATA,sha256=DOKGwf3XspFhCQRLYLod8Oqc2sUjhDaFFe15xiKqQhQ,1185
- datasourcelib-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- datasourcelib-0.1.2.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
- datasourcelib-0.1.2.dist-info/RECORD,,
+ datasourcelib/strategies/__init__.py,sha256=kot3u62KIAqYBg9M-KRE4mkMII_zwrDBZNf8Dj1vmX8,399
+ datasourcelib/strategies/daily_load.py,sha256=Rh-veUhxKYsplwHTyko_Zp9C6NkUJV5VAGtg-p7Iy34,856
+ datasourcelib/strategies/full_load.py,sha256=zqDZZcmyJKXQ4v3coq5njjadlBNI9V8f_lfXVZCoLbQ,1698
+ datasourcelib/strategies/incremental_load.py,sha256=TVqmDLu3m571nqGvzo_69i36QtYe4sBpllFwfPNL0TE,1178
+ datasourcelib/strategies/ondemand_load.py,sha256=VxzAYgrW2ebTOC3xm61CerL2AFehZUJLnKrqtGRGJoE,644
+ datasourcelib/strategies/timerange_load.py,sha256=c62BN2yXwVFaA_dQV54qenP4vrb4rcFqbx6m-nqhaTA,900
+ datasourcelib/utils/__init__.py,sha256=9pSIpaK-kdmNuDzwl0Z7QU-_lV3cZE-iwOEPh3RBBTs,298
+ datasourcelib/utils/byte_reader.py,sha256=GaoPXwJa2YTWG1Kim0K6JG20eVSaWkZJd1o9bswxHmc,9082
+ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4fYQww,369
+ datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
+ datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
+ datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
+ datasourcelib-0.1.3.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+ datasourcelib-0.1.3.dist-info/METADATA,sha256=cPVrPEkPN22sTYOoO20byXcpu5hvKVQIPu3elgyyEko,1185
+ datasourcelib-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datasourcelib-0.1.3.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+ datasourcelib-0.1.3.dist-info/RECORD,,