waveflowdb-client 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/PKG-INFO +3 -3
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/pyproject.toml +2 -2
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/readme.md +1 -1
- waveflowdb_client-0.0.2/waveflowdb_client/__init__.py +18 -0
- waveflowdb_client-0.0.2/waveflowdb_client/client.py +297 -0
- waveflowdb_client-0.0.2/waveflowdb_client/config.py +56 -0
- waveflowdb_client-0.0.2/waveflowdb_client/exceptions.py +11 -0
- waveflowdb_client-0.0.2/waveflowdb_client/models.py +44 -0
- waveflowdb_client-0.0.2/waveflowdb_client/utils.py +149 -0
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/waveflowdb_client.egg-info/PKG-INFO +3 -3
- waveflowdb_client-0.0.2/waveflowdb_client.egg-info/SOURCES.txt +14 -0
- waveflowdb_client-0.0.2/waveflowdb_client.egg-info/top_level.txt +1 -0
- waveflowdb_client-0.0.1/starter.py +0 -195
- waveflowdb_client-0.0.1/waveflowdb_client.egg-info/SOURCES.txt +0 -9
- waveflowdb_client-0.0.1/waveflowdb_client.egg-info/top_level.txt +0 -1
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/LICENSE +0 -0
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/setup.cfg +0 -0
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/waveflowdb_client.egg-info/dependency_links.txt +0 -0
- {waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/waveflowdb_client.egg-info/requires.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
|
-
Name:
|
|
3
|
-
Version: 0.0.
|
|
2
|
+
Name: waveflowdb_client
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
|
|
5
5
|
Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
|
|
6
6
|
License: MIT License
|
|
@@ -73,7 +73,7 @@ to interact with the Vector Lake API.
|
|
|
73
73
|
### 1. Install Dependencies
|
|
74
74
|
|
|
75
75
|
``` bash
|
|
76
|
-
pip install
|
|
76
|
+
pip install waveflowdb_client
|
|
77
77
|
```
|
|
78
78
|
|
|
79
79
|
### 2. Configure API Credentials
|
|
@@ -3,8 +3,8 @@ requires = ["setuptools>=61.0", "wheel"]
|
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
|
-
name = "
|
|
7
|
-
version = "0.0.
|
|
6
|
+
name = "waveflowdb_client" # pip install name
|
|
7
|
+
version = "0.0.2"
|
|
8
8
|
description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
|
|
9
9
|
readme = "readme.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
__version__ = "1.0.0"
|
|
2
|
+
|
|
3
|
+
from .client import VectorLakeClient
|
|
4
|
+
from .config import Config
|
|
5
|
+
from .exceptions import APIError, FileProcessingError
|
|
6
|
+
from .models import DocumentInfo, ChatResponse, MatchingDocsResponse, HealthResponse, BatchResult
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"VectorLakeClient",
|
|
10
|
+
"Config",
|
|
11
|
+
"APIError",
|
|
12
|
+
"FileProcessingError",
|
|
13
|
+
"DocumentInfo",
|
|
14
|
+
"ChatResponse",
|
|
15
|
+
"MatchingDocsResponse",
|
|
16
|
+
"HealthResponse",
|
|
17
|
+
"BatchResult",
|
|
18
|
+
]
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import logging
|
|
3
|
+
import json
|
|
4
|
+
import requests
|
|
5
|
+
import os
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from typing import List, Optional, Dict, Any
|
|
8
|
+
|
|
9
|
+
from .config import Config
|
|
10
|
+
from .utils import FileProcessor, Logger, BatchManager
|
|
11
|
+
from .exceptions import APIError
|
|
12
|
+
from .models import ChatResponse, MatchingDocsResponse, HealthResponse, BatchResult
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
logging.basicConfig(level=logging.INFO)
|
|
16
|
+
|
|
17
|
+
class VectorLakeClient:
    """Client for the Vector Lake query and upload HTTP endpoints.

    Every call is a JSON POST to a URL taken from ``config.endpoints`` and
    carries the API key in the ``x-api-key`` header. File-based operations
    read documents from ``config.vector_lake_path``.
    """

    def __init__(self, config: Optional[Config] = None, **kwargs):
        """Create a client.

        Args:
            config: Ready-made configuration. When omitted, one is built
                with ``Config(**kwargs)``.
            **kwargs: Forwarded to ``Config`` only when ``config`` is None.
        """
        if config is None:
            config = Config(**kwargs)
        logging.info(f"Initializing VectorLakeClient with base_url={config.base_url_query}")
        self.config = config
        self.logger = Logger(config.log_dir)
        self.batch_manager = BatchManager(config.max_files_per_batch, config.max_batch_size_mb)
        self.file_processor = FileProcessor()
        self.perf_csv = "performance_logs.csv"

    def _get_headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every request."""
        return {
            'Content-Type': 'application/json',
            'x-api-key': self.config.api_key
        }

    def _make_request(self, endpoint: str, payload: Dict[str, Any], operation: str = "", batch_num: int = 0) -> Dict[str, Any]:
        """POST ``payload`` as JSON to ``endpoint`` and return the decoded body.

        Transport failures (``requests.exceptions.RequestException``) are
        retried with exponential backoff (1s, 2s, 4s, ...). HTTP responses
        with status >= 400 are not retried here.

        Args:
            endpoint: Full URL to post to.
            payload: JSON-serialisable request body.
            operation: Label for the performance/error logs; logging is
                skipped when empty.
            batch_num: Batch number recorded in the logs.

        Returns:
            The parsed JSON body, or ``{"status_code", "text"}`` when the
            body is not valid JSON.

        Raises:
            APIError: On HTTP status >= 400, or once every attempt has
                failed at the transport level.
        """
        headers = self._get_headers()
        request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload is not None else 0
        # Always try at least once; a max_retries of 0 used to skip the
        # request entirely and silently return None.
        attempts = max(1, self.config.max_retries)
        for attempt in range(attempts):
            try:
                start_time = time.time()
                response = requests.post(endpoint, json=payload, headers=headers, timeout=self.config.timeout)
                latency = (time.time() - start_time) * 1000
                try:
                    result = response.json()
                except Exception:
                    result = {"status_code": response.status_code, "text": response.text}

                if operation:
                    response_size = len(response.content) / 1024 if response.content is not None else 0
                    result_count = "N/A"
                    if isinstance(result, dict):
                        found = result.get("results", [])
                        # Only count sized values so an unexpected payload
                        # shape cannot break the request itself.
                        if hasattr(found, "__len__"):
                            result_count = len(found)
                    self.logger.log_performance(operation, batch_num, latency, request_size, response_size, result_count)

                if response.status_code >= 400:
                    fallback = f'HTTP {response.status_code}'
                    # The error body may be a JSON list or scalar, which has no .get().
                    message = result.get('message', fallback) if isinstance(result, dict) else fallback
                    raise APIError(message, status_code=response.status_code, response_text=response.text)

                return result
            except requests.exceptions.RequestException as e:
                if attempt == attempts - 1:
                    error_msg = f"Request failed after {attempts} attempts: {str(e)}"
                    if operation:
                        self.logger.log_api_error(operation, batch_num, error_msg)
                    raise APIError(error_msg, getattr(e.response, 'status_code', None), getattr(e.response, 'text', None)) from e
                time.sleep(2 ** attempt)

    def _read_files(self, filenames: List[str]) -> List[str]:
        """Read each file under ``config.vector_lake_path``.

        The result is positionally aligned with ``filenames``: unsupported
        or unreadable files are logged as skipped and contribute ``""``.
        """
        contents = []
        for filename in filenames:
            filepath = os.path.join(self.config.vector_lake_path, filename)
            try:
                if self.file_processor.is_supported_file(filename):
                    contents.append(self.file_processor.read_file_content(filepath))
                else:
                    self.logger.log_skipped_file(filename, "Unsupported file type")
                    contents.append("")
            except Exception as e:
                self.logger.log_skipped_file(filename, f"Read error: {str(e)}")
                contents.append("")
        return contents

    def _build_query_payload(self, query, user_id, vector_lake_description, pattern,
                             session_id, hybrid_filter, top_docs, threshold, files) -> Dict[str, Any]:
        """Build the request body shared by the chat and matching-docs endpoints."""
        payload = {
            "session_id": session_id,
            "user_id": user_id,
            "vector_lake_description": vector_lake_description,
            "query": query,
            "hybrid_filter": hybrid_filter,
            "top_docs": top_docs,
            "threshold": threshold,
            "pattern": pattern
        }
        # File contents are attached only in dynamic mode.
        if pattern == "dynamic" and files:
            file_contents = self._read_files(files)
            payload.update({
                "files_name": files,
                "files_data": file_contents
            })
        return payload

    def chat_with_docs(self,
                       query: str,
                       user_id: str,
                       vector_lake_description: str,
                       pattern: str = "static",
                       session_id: Optional[str] = None,
                       hybrid_filter: bool = False,
                       top_docs: int = 3,
                       threshold: float = 0.2,
                       files: Optional[List[str]] = None) -> Dict[str, Any]:
        """Send ``query`` to the ``chat_with_docs`` endpoint.

        Returns the decoded response on success. This method does not raise
        on failure: any exception is converted into a ``ChatResponse`` whose
        ``response`` field is ``"Error: <exception>"``.
        """
        endpoint = self.config.endpoints["chat_with_docs"]
        payload = self._build_query_payload(query, user_id, vector_lake_description, pattern,
                                            session_id, hybrid_filter, top_docs, threshold, files)
        try:
            return self._make_request(endpoint, payload, "chat_with_docs")
        except Exception as e:
            return ChatResponse(response=f"Error: {e}", query=query, session_id=session_id or "", user_id=user_id, timestamp=time.time())

    def get_matching_docs(self,
                          query: str,
                          user_id: str,
                          vector_lake_description: str,
                          pattern: str = "static",
                          session_id: Optional[str] = None,
                          hybrid_filter: bool = False,
                          top_docs: int = 10,
                          threshold: float = 0.2,
                          files: Optional[List[str]] = None,
                          with_data: bool = False) -> Dict[str, Any]:
        """Query the top-matching-docs endpoint.

        ``with_data`` selects ``top_matching_docs_with_data`` instead of
        ``top_matching_docs``. Unlike ``chat_with_docs``, errors propagate.

        Raises:
            APIError: When the request fails.
        """
        endpoint_key = "top_matching_docs_with_data" if with_data else "top_matching_docs"
        endpoint = self.config.endpoints[endpoint_key]
        payload = self._build_query_payload(query, user_id, vector_lake_description, pattern,
                                            session_id, hybrid_filter, top_docs, threshold, files)
        return self._make_request(endpoint, payload, endpoint_key)

    def _upload_direct(self, operation: str, user_id: str, vector_lake_description: str,
                       intelligent_segmentation: bool, session_id: Optional[str],
                       files_name: List[str], files_data: List[str]) -> List[BatchResult]:
        """Send caller-supplied names and contents in one request, without batching."""
        if len(files_name) != len(files_data):
            raise ValueError("files_name and files_data must be same length")
        payload = {
            "session_id": session_id,
            "user_id": user_id,
            "vector_lake_description": vector_lake_description,
            "files_name": files_name,
            "files_data": files_data,
            "intelligent_segmentation": intelligent_segmentation
        }
        endpoint = self.config.endpoints[operation]
        result = self._make_request(endpoint, payload, operation, batch_num=1)
        return [BatchResult(batch_number=1, response=result, files_processed=files_name, success=True)]

    def add_documents(self,
                      user_id: str,
                      vector_lake_description: str,
                      start_from_batch=1,
                      intelligent_segmentation: bool = True,
                      session_id: Optional[str] = None,
                      files: Optional[List[str]] = None,
                      files_name: Optional[List[str]] = None,
                      files_data: Optional[List[str]] = None,
                      max_workers=5) -> List[BatchResult]:
        """Upload documents through the ``add_docs`` endpoint.

        When both ``files_name`` and ``files_data`` are non-empty they are
        sent directly in a single request. Otherwise ``files`` (or, when
        None, every supported file in ``config.vector_lake_path``) is read
        from disk and uploaded in batches starting at ``start_from_batch``.

        Raises:
            ValueError: If ``files_name`` and ``files_data`` differ in length.
            APIError: If the direct (non-batched) request fails.
        """
        # If user supplies file names and data directly, bypass batching
        if files_name and files_data:
            return self._upload_direct("add_docs", user_id, vector_lake_description,
                                       intelligent_segmentation, session_id, files_name, files_data)

        return self._process_files_in_batches(
            "add_docs", user_id, vector_lake_description, start_from_batch, intelligent_segmentation, session_id, files, max_workers=max_workers
        )

    def refresh_documents(self,
                          user_id: str,
                          vector_lake_description: str,
                          intelligent_segmentation: bool = True,
                          session_id: Optional[str] = None,
                          files: Optional[List[str]] = None,
                          files_name: Optional[List[str]] = None,
                          files_data: Optional[List[str]] = None) -> List[BatchResult]:
        """Re-upload documents through the ``refresh_docs`` endpoint.

        Same direct/batched split as ``add_documents``; batched mode always
        starts at batch 1 and uses the default worker count.

        Raises:
            ValueError: If ``files_name`` and ``files_data`` differ in length.
            APIError: If the direct (non-batched) request fails.
        """
        # If user supplies file names and data directly, bypass batching
        if files_name and files_data:
            return self._upload_direct("refresh_docs", user_id, vector_lake_description,
                                       intelligent_segmentation, session_id, files_name, files_data)

        return self._process_files_in_batches(
            "refresh_docs", user_id, vector_lake_description, 1, intelligent_segmentation, session_id, files
        )

    def health_check(self, user_id: str, vector_lake_description: str, session_id: Optional[str] = None) -> HealthResponse:
        """Call the ``health`` endpoint; never raises.

        Returns a ``HealthResponse`` with status ``"success"`` (raw response
        in ``details``) or ``"error"`` (exception text in ``message``).
        """
        endpoint = self.config.endpoints["health"]
        payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description}
        try:
            result = self._make_request(endpoint, payload, "health")
            return HealthResponse(status="success", message=result.get("message", "ok"), timestamp=time.time(), details=result)
        except Exception as e:
            return HealthResponse(status="error", message=str(e), timestamp=time.time())

    def get_namespace_details(self, user_id: str, session_id: Optional[str] = None, vector_lake_description: Optional[str] = None) -> Dict[str, Any]:
        """Fetch namespace details for ``user_id``.

        ``vector_lake_description`` is sent only when truthy. On failure a
        ``{"status": "error", "message": ...}`` dict is returned.
        """
        endpoint = self.config.endpoints["get_namespace_details_by_userid"]
        payload = {"session_id": session_id, "user_id": user_id}
        if vector_lake_description:
            payload["vector_lake_description"] = vector_lake_description
        try:
            return self._make_request(endpoint, payload, "get_namespace_details")
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_docs_information(self, user_id: str, vector_lake_description: str, session_id: Optional[str] = None, keyword: Optional[str] = None, threshold: int = 70) -> Dict[str, Any]:
        """Call the ``get_docs_information`` endpoint.

        ``keyword`` is sent only when truthy. On failure a
        ``{"status": "error", "message": ...}`` dict is returned.
        """
        endpoint = self.config.endpoints["get_docs_information"]
        payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description, "threshold": threshold}
        if keyword:
            payload["keyword"] = keyword
        try:
            return self._make_request(endpoint, payload, "get_docs_information")
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def full_corpus_search(self, user_id: str, vector_lake_description: str, keyword: str, session_id: Optional[str] = None, top_docs: int = 10) -> Dict[str, Any]:
        """Call the ``full_corpus_search`` endpoint with ``keyword``.

        On failure a ``{"status": "error", "message": ...}`` dict is returned.
        """
        endpoint = self.config.endpoints["full_corpus_search"]
        payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description, "keyword": keyword, "top_docs": top_docs}
        try:
            return self._make_request(endpoint, payload, "full_corpus_search")
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
        """Call ``_make_request``, retrying on HTTP 429 with a doubling delay.

        Raises:
            APIError: Immediately for any non-429 API error, or the last 429
                error once ``retries`` attempts are used up. Previously the
                loop ended silently and returned None, which callers
                recorded as a successful batch.
        """
        delay = base_delay
        attempts = max(1, retries)
        for attempt in range(attempts):
            try:
                return self._make_request(endpoint, payload, operation, batch_num)
            except APIError as e:
                if getattr(e, "status_code", None) != 429:
                    raise
                if attempt == attempts - 1:
                    # Out of attempts: surface the throttling error.
                    raise
                logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
                time.sleep(delay)
                delay *= 2

    def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str, start_from_batch, intelligent_segmentation: bool = False, session_id: Optional[str] = None, files: Optional[List[str]] = None, max_workers: int = 1, batch_delay: float = 2) -> List[BatchResult]:
        """Upload files from ``config.vector_lake_path`` in batches.

        Args:
            operation: Endpoint key, also used as the log label.
            start_from_batch: 1-based number of the first batch to send;
                earlier batches are skipped.
            files: File names relative to ``config.vector_lake_path``. When
                None, every supported regular file in that directory is used.
            max_workers: Size of the thread pool sending the requests.
            batch_delay: Seconds to sleep after submitting each batch.

        Returns:
            One ``BatchResult`` per submitted batch, in completion order.
            Failed batches are reported with ``success=False`` rather than
            raised.
        """
        if files is None:
            root = self.config.vector_lake_path
            files = [f for f in os.listdir(root) if os.path.isfile(os.path.join(root, f)) and self.file_processor.is_supported_file(f)]
        batches = self.batch_manager.create_batches(files, self.config.vector_lake_path)
        # The endpoint does not vary per batch, so resolve it once.
        endpoint = self.config.endpoints[operation]
        results = []
        start_batch_index = start_from_batch - 1
        if start_from_batch > 1:
            logging.info(f"Resuming from batch {start_from_batch}, skipping first {start_from_batch - 1} batches")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {}
            for i, batch in enumerate(batches):
                if i < start_batch_index:
                    logging.info(f"Skipping batch {i + 1}")
                    continue
                batch_num = i + 1
                file_contents = self._read_files(batch)
                payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description, "files_name": batch, "files_data": file_contents, "intelligent_segmentation": intelligent_segmentation}
                futures[executor.submit(self._make_request_with_backoff, endpoint, payload, operation, batch_num)] = (batch_num, batch, time.time())
                # Pace submissions between batches.
                time.sleep(batch_delay)
            for future in as_completed(futures):
                batch_num, batch, start_time = futures[future]
                try:
                    result = future.result()
                    processing_time = time.time() - start_time
                    logging.info(f"Batch {batch_num} done")
                    results.append(BatchResult(batch_number=batch_num, response=result, files_processed=batch, success=True, processing_time=processing_time))
                except Exception as e:
                    processing_time = time.time() - start_time
                    logging.error(f"Batch {batch_num} failed: {str(e)}")
                    results.append(BatchResult(batch_number=batch_num, response=str(e), files_processed=batch, success=False, error_message=str(e), processing_time=processing_time))
        return results
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional, Dict
|
|
3
|
+
|
|
4
|
+
class Config:
    """Connection, batching and path settings for the Vector Lake client."""

    ALLOWED_EXTENSIONS = ["txt", "csv", "json", "py", "docx", "pdf"]

    def __init__(
        self,
        api_key: Optional[str] = None,
        host: str = "https://waveflow-analytics.com",
        timeout: int = 60,
        max_retries: int = 2,
        max_files_per_batch: int = 100,
        max_batch_size_mb: int = 20,
        vector_lake_path: str = "upload",
        log_dir: str = "logs",
    ):
        """Store the settings and create the log and upload directories.

        The API key falls back to the ``VECTOR_LAKE_API_KEY`` environment
        variable, and trailing slashes are stripped from ``host``.

        Raises:
            ValueError: If no API key is given and the variable is unset.
        """
        env_key = os.getenv("VECTOR_LAKE_API_KEY")
        self.api_key = api_key if api_key else env_key
        self.host = host.rstrip("/")
        self.timeout = timeout
        self.max_retries = max_retries
        self.max_files_per_batch = max_files_per_batch
        self.max_batch_size_mb = max_batch_size_mb
        self.vector_lake_path = vector_lake_path
        self.log_dir = log_dir

        if not self.api_key:
            raise ValueError("API key is required. Provide api_key or set VECTOR_LAKE_API_KEY environment variable.")

        # Both working directories are created up front.
        for directory in (self.log_dir, self.vector_lake_path):
            os.makedirs(directory, exist_ok=True)

    @property
    def base_url_query(self) -> str:
        """Base URL of the query service."""
        return f"{self.host}/query"

    @property
    def base_url_upload(self) -> str:
        """Base URL of the upload service."""
        return f"{self.host}/upload"

    @property
    def endpoints(self) -> Dict[str, str]:
        """Map each operation name to its full URL (rebuilt on every access)."""
        query_ops = (
            "chat_with_docs",
            "top_matching_docs",
            "top_matching_docs_with_data",
            "full_corpus_search",
        )
        upload_ops = (
            "add_docs",
            "refresh_docs",
            "health",
            "get_namespace_details_by_userid",
            "get_docs_information",
        )
        # Query service entries come first, then upload service entries.
        table = {op: f"{self.base_url_query}/{op}" for op in query_ops}
        for op in upload_ops:
            table[op] = f"{self.base_url_upload}/{op}"
        return table
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
class VectorLakeError(Exception):
    """Base class for the exceptions defined by this package."""
|
|
3
|
+
|
|
4
|
+
class APIError(VectorLakeError):
    """A failed API call, with the HTTP status and raw response body if known."""

    def __init__(self, message: str, status_code: int = None, response_text: str = None):
        """Store the message plus optional HTTP status code and response text."""
        self.status_code, self.response_text = status_code, response_text
        super().__init__(message)
|
|
9
|
+
|
|
10
|
+
class FileProcessingError(VectorLakeError):
    """Raised when a file cannot be read or has an unsupported extension."""
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import List, Dict, Any, Optional
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
@dataclass
class DocumentInfo:
    """Name, content, size and optional modification time of one document."""

    filename: str
    content: str
    size: int
    modified_time: Optional[datetime] = None  # optional; defaults to None
|
|
11
|
+
|
|
12
|
+
@dataclass
class ChatResponse:
    """Chat reply together with the query and identifiers it belongs to."""

    response: str
    query: str
    session_id: str
    user_id: str
    # NOTE(review): VectorLakeClient.chat_with_docs fills this with
    # time.time() (a float), not a datetime - confirm the intended type.
    timestamp: datetime
|
|
19
|
+
|
|
20
|
+
@dataclass
class MatchingDocsResponse:
    """Result record for a matching-documents query."""

    status: str
    query: str
    response: str
    with_data: bool
    session_id: str
    user_id: str
    timestamp: datetime
|
|
29
|
+
|
|
30
|
+
@dataclass
class HealthResponse:
    """Outcome of a health check."""

    status: str
    message: str
    # NOTE(review): VectorLakeClient.health_check fills this with
    # time.time() (a float), not a datetime - confirm the intended type.
    timestamp: datetime
    details: Optional[Dict[str, Any]] = None  # optional; defaults to None
|
|
36
|
+
|
|
37
|
+
@dataclass
class BatchResult:
    """Outcome of one upload batch."""

    batch_number: int
    response: Any
    files_processed: List[str]
    success: bool
    error_message: Optional[str] = None  # optional; defaults to None
    processing_time: Optional[float] = None  # optional; defaults to None
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import csv
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
import json
|
|
7
|
+
import traceback
|
|
8
|
+
import time
|
|
9
|
+
from typing import List
|
|
10
|
+
# Heavy libs are imported at runtime when needed in real environments.
|
|
11
|
+
|
|
12
|
+
from .exceptions import FileProcessingError
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
logging.basicConfig(level=logging.INFO)
|
|
16
|
+
|
|
17
|
+
class FileProcessor:
    """Static helpers for recognising and reading supported document files."""

    SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf']

    @staticmethod
    def _extension(path: str) -> str:
        """Return the lower-cased extension of ``path`` without the dot.

        Only the final path component is inspected, and a name containing no
        dot has no extension. Plain ``split('.')[-1]`` returned the whole
        name in that case, so a file literally called ``txt`` or ``pdf`` was
        taken for a supported document.
        """
        _, dot, ext = os.path.basename(path).rpartition('.')
        return ext.lower() if dot else ''

    @staticmethod
    def read_file_content(filepath: str) -> str:
        """Return the text content of ``filepath``.

        ``txt``/``csv``/``py``/``json`` files are read as UTF-8. ``docx``
        and ``pdf`` are parsed with ``python-docx`` and ``PyPDF2``, imported
        on demand so the package still imports without them.

        Raises:
            FileProcessingError: If the extension is unsupported or the
                parser library for it cannot be imported.
        """
        ext = FileProcessor._extension(filepath)
        if ext in ['txt', 'csv', 'py', 'json']:
            with open(filepath, encoding='utf-8') as f:
                return f.read()
        # Defer complex parsing to runtime imports to avoid import-time failures.
        if ext == 'docx':
            try:
                from docx import Document
            except Exception as e:
                raise FileProcessingError(f"python-docx not available: {e}") from e
            doc = Document(filepath)
            return '\n'.join(p.text for p in doc.paragraphs)
        if ext == 'pdf':
            try:
                import PyPDF2
            except Exception as e:
                raise FileProcessingError(f"PyPDF2 not available: {e}") from e
            with open(filepath, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may yield nothing for a page; treat as empty.
                return '\n'.join(p.extract_text() or '' for p in reader.pages)
        raise FileProcessingError(f'Unsupported extension: {ext}')

    @staticmethod
    def get_file_size(filepath: str) -> int:
        """Return the size of ``filepath`` in bytes."""
        return os.path.getsize(filepath)

    @staticmethod
    def is_supported_file(filename: str) -> bool:
        """Return True when ``filename`` has one of ``SUPPORTED_EXTENSIONS``."""
        return FileProcessor._extension(filename) in FileProcessor.SUPPORTED_EXTENSIONS
|
|
52
|
+
|
|
53
|
+
class Logger:
    """Append-only CSV logs for skipped files, API errors and request performance."""

    def __init__(self, log_dir: str):
        """Create ``log_dir`` if needed and set the three log file paths."""
        root = Path(log_dir)
        root.mkdir(parents=True, exist_ok=True)
        self.skipped_log = root / 'skipped_files_log.csv'
        self.api_error_log = root / 'api_error_log.csv'
        self.performance_log = root / 'api_performance_log.csv'

    def _write_csv_log(self, path: Path, header: List[str], row: List):
        """Append ``row`` to ``path``, writing ``header`` first for a new file."""
        exists = path.exists()
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not exists:
                writer.writerow(header)
            writer.writerow(row)

    def log_skipped_file(self, filename: str, reason: str):
        """Record that ``filename`` was skipped and why."""
        self._write_csv_log(self.skipped_log, ['ts', 'filename', 'reason'], [datetime.utcnow().isoformat(), filename, reason])

    def log_api_error(self, operation: str, batch_num: int, error_message: str):
        """Record a failed API call."""
        self._write_csv_log(self.api_error_log, ['ts', 'operation', 'batch_num', 'err'], [datetime.utcnow().isoformat(), operation, batch_num, error_message])

    def log_performance(self, operation=None, batch_num=None, latency=None,
                        request_size=None, response_size=None, result_count=None,
                        files_processed=None, error: Exception = None):
        """Append one performance row; never raises.

        The row goes to ``self.perf_csv`` when that attribute has been set,
        otherwise to ``self.performance_log`` inside the log directory.
        Previously the fallback was a relative ``performance_logs.csv`` in
        the working directory, leaving ``performance_log`` unused.

        Every value is stringified: lists, tuples and sets are joined with
        commas and None becomes an empty string. A write failure is printed
        instead of raised.
        """
        logfile = getattr(self, "perf_csv", None) or getattr(self, "performance_log", "performance_logs.csv")

        # --- Safe stringify for ANY type ---
        def safe_str(value):
            try:
                if isinstance(value, (list, tuple, set)):
                    return ",".join(str(v) for v in value)
                return str(value) if value is not None else ""
            except Exception:
                return "UNSERIALIZABLE"

        row = {
            "timestamp": datetime.utcnow().isoformat(),
            "operation": safe_str(operation),
            "batch_num": safe_str(batch_num),
            "latency_ms": safe_str(latency),
            "request_size": safe_str(request_size),
            "response_size": safe_str(response_size),
            "result_count": safe_str(result_count),
            "files_processed": safe_str(files_processed),
            "error": safe_str(error),
        }

        try:
            file_exists = os.path.isfile(logfile)

            with open(logfile, mode="a", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=row.keys())

                if not file_exists:
                    writer.writeheader()

                writer.writerow(row)

        except Exception:
            # Logging is best-effort: report the failure and carry on.
            print("[CSV LOGGING ERROR] Could not write performance log:")
            print(traceback.format_exc())
|
|
119
|
+
|
|
120
|
+
class BatchManager:
    """Group files into batches bounded by file count and total size."""

    def __init__(self, max_files: int, max_size_mb: int):
        """Store the per-batch file limit and the size limit converted to bytes."""
        self.max_files = max_files
        self.max_bytes = max_size_mb * 1024 * 1024

    def create_batches(self, files: List[str], base_path: str) -> List[List[str]]:
        """Split ``files`` (names relative to ``base_path``) into batches.

        Input order is kept. A batch is closed as soon as adding the next
        file would exceed ``max_files`` or ``max_bytes``. Files whose size
        cannot be read, or which are larger than ``max_bytes`` on their own,
        are left out; each omission is now logged as a warning instead of
        happening silently.

        Returns:
            A list of batches, each a non-empty list of file names.
        """
        log = logging.getLogger(__name__)
        sized = []
        for name in files:
            path = os.path.join(base_path, name)
            try:
                size = os.path.getsize(path)
            except (OSError, ValueError) as exc:
                log.warning("Skipping %s: cannot determine size (%s)", name, exc)
                continue
            if size > self.max_bytes:
                log.warning("Skipping %s: %d bytes exceeds the batch limit of %d bytes", name, size, self.max_bytes)
                continue
            sized.append((name, size))

        batches = []
        current = []
        current_size = 0
        for name, size in sized:
            is_full = len(current) >= self.max_files or (current_size + size) > self.max_bytes
            if is_full and current:
                batches.append(current)
                current = []
                current_size = 0
            current.append(name)
            current_size += size
        if current:
            batches.append(current)
        return batches
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
|
-
Name:
|
|
3
|
-
Version: 0.0.
|
|
2
|
+
Name: waveflowdb_client
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
|
|
5
5
|
Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
|
|
6
6
|
License: MIT License
|
|
@@ -73,7 +73,7 @@ to interact with the Vector Lake API.
|
|
|
73
73
|
### 1. Install Dependencies
|
|
74
74
|
|
|
75
75
|
``` bash
|
|
76
|
-
pip install
|
|
76
|
+
pip install waveflowdb_client
|
|
77
77
|
```
|
|
78
78
|
|
|
79
79
|
### 2. Configure API Credentials
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
pyproject.toml
|
|
3
|
+
readme.md
|
|
4
|
+
waveflowdb_client/__init__.py
|
|
5
|
+
waveflowdb_client/client.py
|
|
6
|
+
waveflowdb_client/config.py
|
|
7
|
+
waveflowdb_client/exceptions.py
|
|
8
|
+
waveflowdb_client/models.py
|
|
9
|
+
waveflowdb_client/utils.py
|
|
10
|
+
waveflowdb_client.egg-info/PKG-INFO
|
|
11
|
+
waveflowdb_client.egg-info/SOURCES.txt
|
|
12
|
+
waveflowdb_client.egg-info/dependency_links.txt
|
|
13
|
+
waveflowdb_client.egg-info/requires.txt
|
|
14
|
+
waveflowdb_client.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
waveflowdb_client
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
run.py
|
|
3
|
-
Simple launcher for Vector Lake SDK (v1.0.0)
|
|
4
|
-
|
|
5
|
-
Allows you to:
|
|
6
|
-
- configure client (host, port, key)
|
|
7
|
-
- call ANY API: add, refresh, chat, match, health, namespace info, etc.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
from waveflowdb-client import Config, VectorLakeClient
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
# -------------------------------------------------------
|
|
14
|
-
# CONFIGURATION (EDIT THIS ONCE)
|
|
15
|
-
# -------------------------------------------------------
|
|
16
|
-
API_KEY = "<<>>" # visit https://db.agentanalytics.ai/signup
|
|
17
|
-
HOST = "https://waveflow-analytics.com" # OR "http://localhost"
|
|
18
|
-
VECTOR_LAKE_PATH = "<<>>" # folder for path-based ingestion
|
|
19
|
-
USER_ID = "" ## your email id used for registration
|
|
20
|
-
NAMESPACE = "" ## database created via UI
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
# -------------------------------------------------------
|
|
24
|
-
# INITIALIZE CLIENT
|
|
25
|
-
# -------------------------------------------------------
|
|
26
|
-
def get_client():
    """Build a VectorLakeClient from the module-level API_KEY, HOST and VECTOR_LAKE_PATH."""
    settings = dict(api_key=API_KEY, host=HOST, vector_lake_path=VECTOR_LAKE_PATH)
    return VectorLakeClient(Config(**settings))
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
client = get_client()
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# -------------------------------------------------------
|
|
39
|
-
# READY-TO-USE ACTION FUNCTIONS
|
|
40
|
-
# -------------------------------------------------------
|
|
41
|
-
|
|
42
|
-
def run_health():
    """Call the health-check endpoint and print the result."""
    print("\n--- HEALTH CHECK ---")
    print(client.health_check(USER_ID, NAMESPACE))
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def run_add_direct():
    """Add two in-memory sample documents via files_name + files_data and print the result."""
    print("\n--- ADD DOCUMENTS (Direct Payload Mode) ---")
    names = ["test1.txt", "test2.txt"]
    bodies = ["hello world", "this is test doc 2"]
    outcome = client.add_documents(
        user_id=USER_ID,
        vector_lake_description=NAMESPACE,
        files_name=names,
        files_data=bodies,
    )
    print(outcome)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def run_add_path():
    """Call add_documents without inline file data and print the response.

    Only the user id and namespace are passed. Per the original example this
    is the "disk path" mode; the example also hints at an optional
    ``files=[...]`` argument whose entries must exist inside VECTOR_LAKE_PATH.
    """
    print("\n--- ADD DOCUMENTS (Disk Path Mode) ---")
    response = client.add_documents(user_id=USER_ID, vector_lake_description=NAMESPACE)
    print(response)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def run_refresh_direct():
    """Refresh ``test1.txt`` with inline replacement content and print the response."""
    print("\n--- REFRESH DOCUMENTS (Direct Data Mode) ---")
    names = ["test1.txt"]
    contents = ["UPDATED CONTENT FOR TEST1"]
    outcome = client.refresh_documents(
        user_id=USER_ID,
        vector_lake_description=NAMESPACE,
        files_name=names,
        files_data=contents,
    )
    print(outcome)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def run_refresh_path():
    """Call refresh_documents without inline file data and print the response.

    Only the user id and namespace are passed. The original example hints at
    an optional ``files=["file1.pdf"]`` argument naming files that must exist.
    """
    print("\n--- REFRESH DOCUMENTS (Path Mode) ---")
    request = {"user_id": USER_ID, "vector_lake_description": NAMESPACE}
    print(client.refresh_documents(**request))
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def run_chat_static(query):
    """Send ``query`` to chat_with_docs with pattern="static" and print the reply."""
    print("\n--- CHAT (STATIC MODE) ---")
    request = {
        "query": query,
        "user_id": USER_ID,
        "vector_lake_description": NAMESPACE,
        "pattern": "static",
    }
    print(client.chat_with_docs(**request))
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def run_chat_dynamic(query):
    """Send ``query`` to chat_with_docs with pattern="dynamic" and print the reply.

    One inline sample document (``dyn1.txt``) accompanies the request.
    """
    print("\n--- CHAT (DYNAMIC MODE) ---")
    inline_names = ["dyn1.txt"]
    inline_texts = ["This is dynamic content to summarize."]
    answer = client.chat_with_docs(
        query=query,
        user_id=USER_ID,
        vector_lake_description=NAMESPACE,
        pattern="dynamic",
        files_name=inline_names,
        files_data=inline_texts,
    )
    print(answer)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def run_match_static(query):
    """Call get_matching_docs for ``query`` in static mode and print the result.

    The request passes pattern="static", top_docs=5 and threshold=0.1.
    """
    print("\n--- TOP MATCHING DOCS (STATIC) ---")
    options = {
        "query": query,
        "user_id": USER_ID,
        "vector_lake_description": NAMESPACE,
        "pattern": "static",
        "top_docs": 5,
        "threshold": 0.1,
    }
    print(client.get_matching_docs(**options))
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
def run_match_dynamic(query):
    """Call get_matching_docs for ``query`` in dynamic mode and print the result.

    One inline sample document (``temp.txt``) accompanies the request.
    """
    print("\n--- TOP MATCHING DOCS (DYNAMIC) ---")
    inline_names = ["temp.txt"]
    inline_texts = ["Sample dynamic content"]
    matches = client.get_matching_docs(
        query=query,
        user_id=USER_ID,
        vector_lake_description=NAMESPACE,
        pattern="dynamic",
        files_name=inline_names,
        files_data=inline_texts,
    )
    print(matches)
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
def run_match_with_data(query):
    """Call get_matching_docs for ``query`` with with_data=True and print the result.

    The request passes pattern="static" and top_docs=5.
    """
    print("\n--- TOP MATCHING DOCS (WITH DATA) ---")
    options = {
        "query": query,
        "user_id": USER_ID,
        "vector_lake_description": NAMESPACE,
        "pattern": "static",
        "top_docs": 5,
        "with_data": True,
    }
    print(client.get_matching_docs(**options))
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def run_namespace_details():
    """Print a banner, then the result of client.get_namespace_details for NAMESPACE."""
    print("\n--- GET NAMESPACE DETAILS ---")
    print(client.get_namespace_details(USER_ID, vector_lake_description=NAMESPACE))
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
def run_docs_info():
    """Print a banner, then the result of client.get_docs_information for USER_ID / NAMESPACE."""
    print("\n--- GET DOCS INFORMATION ---")
    print(client.get_docs_information(USER_ID, NAMESPACE))
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
# -------------------------------------------------------
|
|
178
|
-
# MAIN SELECTOR – RUN ANY FUNCTION YOU WANT
|
|
179
|
-
# -------------------------------------------------------
|
|
180
|
-
if __name__ == "__main__":
    # Placeholder prompt for the chat/match helpers; replace before use.
    query = "<<>>"

    # Only one operation is enabled at a time. To run a different one,
    # comment out the active call and uncomment the one you want:
    #   run_health()
    #   run_add_direct()
    #   run_add_path()
    #   run_refresh_direct()
    #   run_refresh_path()
    #   run_chat_static(query)
    #   run_chat_dynamic(query)
    #   run_match_static(query)
    #   run_match_dynamic(query)
    #   run_match_with_data(query)
    run_namespace_details()
    #   run_docs_info()
|
|
195
|
-
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
starter
|
|
File without changes
|
|
File without changes
|
{waveflowdb_client-0.0.1 → waveflowdb_client-0.0.2}/waveflowdb_client.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|