waveflowdb-client 0.0.4__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waveflowdb_client-0.0.4/waveflowdb_client.egg-info → waveflowdb_client-1.0.0}/PKG-INFO +2 -2
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/pyproject.toml +2 -2
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/__init__.py +1 -1
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/client.py +172 -143
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/config.py +2 -2
- waveflowdb_client-1.0.0/waveflowdb_client/utils.py +551 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0/waveflowdb_client.egg-info}/PKG-INFO +2 -2
- waveflowdb_client-0.0.4/waveflowdb_client/utils.py +0 -149
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/LICENSE +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/readme.md +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/setup.cfg +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/exceptions.py +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client/models.py +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client.egg-info/SOURCES.txt +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client.egg-info/dependency_links.txt +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client.egg-info/requires.txt +0 -0
- {waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: waveflowdb_client
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
|
|
5
5
|
Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
|
|
6
6
|
License: MIT License
|
|
@@ -27,7 +27,7 @@ License: MIT License
|
|
|
27
27
|
|
|
28
28
|
Project-URL: Homepage, https://agentanalytics.ai
|
|
29
29
|
Project-URL: Documentation, https://www.agentanalytics.ai/docs/waveflow-db
|
|
30
|
-
Keywords: vector db,VECTOR QUERY LANGUAGE,
|
|
30
|
+
Keywords: vector db,VECTOR QUERY LANGUAGE,waveflowdb,agentanalytics,VQL
|
|
31
31
|
Requires-Python: >=3.8
|
|
32
32
|
Description-Content-Type: text/markdown
|
|
33
33
|
License-File: LICENSE
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "waveflowdb_client" # pip install name
|
|
7
|
-
version = "0.0.4"
|
|
7
|
+
version = "1.0.0"
|
|
8
8
|
description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
|
|
9
9
|
readme = "readme.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -14,7 +14,7 @@ authors = [
|
|
|
14
14
|
{ name = "agentanalytics.ai", email = "nitin@agentanalytics.ai" }
|
|
15
15
|
]
|
|
16
16
|
|
|
17
|
-
keywords = ["vector db", "VECTOR QUERY LANGUAGE", "
|
|
17
|
+
keywords = ["vector db", "VECTOR QUERY LANGUAGE", "waveflowdb", "agentanalytics", "VQL"]
|
|
18
18
|
|
|
19
19
|
dependencies = [
|
|
20
20
|
"requests",
|
|
@@ -3,13 +3,12 @@ import logging
|
|
|
3
3
|
import json
|
|
4
4
|
import requests
|
|
5
5
|
import os
|
|
6
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
6
|
from typing import List, Optional, Dict, Any
|
|
8
7
|
|
|
9
8
|
from .config import Config
|
|
10
9
|
from .utils import FileProcessor, Logger, BatchManager
|
|
11
|
-
from .exceptions import APIError
|
|
12
|
-
from .models import HealthResponse
|
|
10
|
+
from .exceptions import APIError, FileProcessingError
|
|
11
|
+
from .models import HealthResponse
|
|
13
12
|
|
|
14
13
|
logger = logging.getLogger(__name__)
|
|
15
14
|
logging.basicConfig(level=logging.INFO)
|
|
@@ -24,36 +23,42 @@ class VectorLakeClient:
|
|
|
24
23
|
self.logger = Logger(config.log_dir)
|
|
25
24
|
self.batch_manager = BatchManager(config.max_files_per_batch, config.max_batch_size_mb)
|
|
26
25
|
self.file_processor = FileProcessor()
|
|
27
|
-
self.perf_csv = "performance_logs.csv"
|
|
28
26
|
|
|
29
27
|
def _get_headers(self) -> Dict[str, str]:
|
|
30
|
-
return {
|
|
31
|
-
'Content-Type': 'application/json',
|
|
32
|
-
'x-api-key': self.config.api_key
|
|
33
|
-
}
|
|
28
|
+
return {'Content-Type': 'application/json', 'x-api-key': self.config.api_key}
|
|
34
29
|
|
|
35
30
|
def _make_request(self, endpoint: str, payload: Dict[str, Any], operation: str = "", batch_num: int = 0) -> Dict[str, Any]:
|
|
36
31
|
headers = self._get_headers()
|
|
37
|
-
request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload
|
|
32
|
+
request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload else 0
|
|
33
|
+
|
|
38
34
|
for attempt in range(self.config.max_retries):
|
|
39
35
|
try:
|
|
40
36
|
start_time = time.time()
|
|
41
37
|
response = requests.post(endpoint, json=payload, headers=headers, timeout=self.config.timeout)
|
|
42
38
|
latency = (time.time() - start_time) * 1000
|
|
39
|
+
|
|
43
40
|
try:
|
|
44
41
|
result = response.json()
|
|
45
42
|
except Exception:
|
|
46
43
|
result = {"status_code": response.status_code, "text": response.text}
|
|
47
44
|
|
|
48
45
|
if operation:
|
|
49
|
-
response_size = len(response.content) / 1024 if response.content
|
|
46
|
+
response_size = len(response.content) / 1024 if response.content else 0
|
|
50
47
|
result_count = len(result.get("results", [])) if isinstance(result, dict) else "N/A"
|
|
51
|
-
self.logger.log_performance(
|
|
48
|
+
self.logger.log_performance(
|
|
49
|
+
operation=operation,
|
|
50
|
+
batch_num=batch_num,
|
|
51
|
+
latency=latency,
|
|
52
|
+
request_size=request_size,
|
|
53
|
+
response_size=response_size,
|
|
54
|
+
result_count=result_count
|
|
55
|
+
)
|
|
52
56
|
|
|
53
57
|
if response.status_code >= 400:
|
|
54
58
|
raise APIError(result.get('message', f'HTTP {response.status_code}'), status_code=response.status_code, response_text=response.text)
|
|
55
59
|
|
|
56
60
|
return result
|
|
61
|
+
|
|
57
62
|
except requests.exceptions.RequestException as e:
|
|
58
63
|
if attempt == self.config.max_retries - 1:
|
|
59
64
|
error_msg = f"Request failed after {self.config.max_retries} attempts: {str(e)}"
|
|
@@ -62,19 +67,38 @@ class VectorLakeClient:
|
|
|
62
67
|
raise APIError(error_msg, getattr(e.response, 'status_code', None), getattr(e.response, 'text', None))
|
|
63
68
|
time.sleep(2 ** attempt)
|
|
64
69
|
|
|
65
|
-
def
|
|
70
|
+
def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
    """Call ``self._make_request`` and retry with exponential backoff on HTTP 429.

    Args:
        endpoint: URL passed through to ``_make_request``.
        payload: JSON-serialisable request body passed through unchanged.
        operation: Operation label passed through to ``_make_request``.
        batch_num: Batch number, used for the request and the warning log.
        retries: Maximum number of attempts; values below 1 are treated as 1.
        base_delay: Seconds to wait before the first retry; doubled after
            each throttled attempt.

    Returns:
        Whatever ``_make_request`` returns on the first successful attempt.

    Raises:
        APIError: Immediately for any non-429 error, or for the last 429
            once all attempts are used up. Other exceptions propagate
            untouched.
    """
    attempts = max(1, retries)
    delay = base_delay
    for attempt in range(attempts):
        try:
            return self._make_request(endpoint, payload, operation, batch_num)
        except APIError as e:
            throttled = getattr(e, "status_code", None) == 429
            # Re-raise on the final attempt instead of falling out of the
            # loop: an implicit ``None`` return would look like a successful
            # (empty) response to the caller.
            if not throttled or attempt == attempts - 1:
                raise
            logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
            time.sleep(delay)
            delay *= 2
|
|
84
|
+
|
|
85
|
+
def _read_files(self, filenames: List[str], chunks_dir: Optional[str] = None) -> List[str]:
|
|
86
|
+
"""
|
|
87
|
+
Reads files safely. If chunks_dir is provided, reads files from that folder.
|
|
88
|
+
"""
|
|
66
89
|
contents = []
|
|
67
|
-
for
|
|
68
|
-
|
|
90
|
+
for fname in filenames:
|
|
91
|
+
path_base = chunks_dir if chunks_dir else self.config.vector_lake_path
|
|
92
|
+
filepath = os.path.join(path_base, fname)
|
|
69
93
|
try:
|
|
70
|
-
if self.file_processor.is_supported_file(
|
|
94
|
+
if self.file_processor.is_supported_file(fname):
|
|
71
95
|
content = self.file_processor.read_file_content(filepath)
|
|
72
96
|
contents.append(content)
|
|
73
97
|
else:
|
|
74
|
-
self.logger.log_skipped_file(
|
|
98
|
+
self.logger.log_skipped_file(fname, "Unsupported file type")
|
|
75
99
|
contents.append("")
|
|
76
100
|
except Exception as e:
|
|
77
|
-
self.logger.log_skipped_file(
|
|
101
|
+
self.logger.log_skipped_file(fname, f"Read error: {str(e)}")
|
|
78
102
|
contents.append("")
|
|
79
103
|
return contents
|
|
80
104
|
|
|
@@ -106,36 +130,23 @@ class VectorLakeClient:
|
|
|
106
130
|
"pattern": pattern
|
|
107
131
|
}
|
|
108
132
|
|
|
109
|
-
#
|
|
133
|
+
# Direct mode
|
|
110
134
|
if files_name and files_data:
|
|
111
135
|
if len(files_name) != len(files_data):
|
|
112
136
|
raise ValueError("files_name and files_data must be same length")
|
|
113
|
-
|
|
114
|
-
# ensure names are sanitized (no full paths)
|
|
115
137
|
clean_names = [os.path.basename(n) for n in files_name]
|
|
116
|
-
payload.update({
|
|
117
|
-
"files_name": clean_names,
|
|
118
|
-
"files_data": files_data,
|
|
119
|
-
"pattern": "dynamic"
|
|
120
|
-
})
|
|
138
|
+
payload.update({"files_name": clean_names, "files_data": files_data, "pattern": "dynamic"})
|
|
121
139
|
return self._make_request(endpoint, payload, endpoint_key)
|
|
122
140
|
|
|
123
|
-
#
|
|
141
|
+
# Dynamic mode: read from filesystem
|
|
124
142
|
if pattern == "dynamic" and files:
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
file_contents = self._read_files(
|
|
128
|
-
payload.update({
|
|
129
|
-
"files_name": clean_names,
|
|
130
|
-
"files_data": file_contents,
|
|
131
|
-
"pattern": "dynamic"
|
|
132
|
-
})
|
|
133
|
-
|
|
134
|
-
# debug print — remove or comment out in prod if not needed
|
|
135
|
-
logging.debug("MATCHING DOCS DYNAMIC PAYLOAD: %s", json.dumps(payload, indent=2)[:2000])
|
|
143
|
+
batches, chunks_dir = self.batch_manager.create_batches(files, self.config.vector_lake_path)
|
|
144
|
+
flat_files = [fname for batch in batches for fname in batch]
|
|
145
|
+
file_contents = self._read_files(flat_files, chunks_dir)
|
|
146
|
+
payload.update({"files_name": flat_files, "files_data": file_contents, "pattern": "dynamic"})
|
|
136
147
|
return self._make_request(endpoint, payload, endpoint_key)
|
|
137
148
|
|
|
138
|
-
#
|
|
149
|
+
# Static mode
|
|
139
150
|
return self._make_request(endpoint, payload, endpoint_key)
|
|
140
151
|
|
|
141
152
|
def add_documents(self,
|
|
@@ -149,11 +160,9 @@ class VectorLakeClient:
|
|
|
149
160
|
files_data: Optional[List[str]] = None,
|
|
150
161
|
max_workers=5) -> Any:
|
|
151
162
|
"""
|
|
152
|
-
|
|
153
|
-
Filesystem batch mode returns a 'batch' envelope.
|
|
163
|
+
Add documents either in direct mode (names + data) or batch mode (filesystem).
|
|
154
164
|
"""
|
|
155
|
-
|
|
156
|
-
# Direct mode: user supplied names + data -> single request (raw server response)
|
|
165
|
+
# Direct mode
|
|
157
166
|
if files_name and files_data:
|
|
158
167
|
if len(files_name) != len(files_data):
|
|
159
168
|
raise ValueError("files_name and files_data must be same length")
|
|
@@ -166,13 +175,12 @@ class VectorLakeClient:
|
|
|
166
175
|
"intelligent_segmentation": intelligent_segmentation
|
|
167
176
|
}
|
|
168
177
|
endpoint = self.config.endpoints["add_docs"]
|
|
169
|
-
|
|
170
|
-
result = self._make_request(endpoint, payload, "add_docs", batch_num=1)
|
|
171
|
-
return result
|
|
178
|
+
return self._make_request(endpoint, payload, "add_docs", batch_num=1)
|
|
172
179
|
|
|
173
|
-
# Batch mode
|
|
180
|
+
# Batch mode
|
|
174
181
|
return self._process_files_in_batches(
|
|
175
|
-
"add_docs", user_id, vector_lake_description, start_from_batch,
|
|
182
|
+
"add_docs", user_id, vector_lake_description, start_from_batch,
|
|
183
|
+
intelligent_segmentation, session_id, files, max_workers=max_workers
|
|
176
184
|
)
|
|
177
185
|
|
|
178
186
|
def refresh_documents(self,
|
|
@@ -186,16 +194,11 @@ class VectorLakeClient:
|
|
|
186
194
|
files_data: Optional[List[str]] = None,
|
|
187
195
|
max_workers=5) -> Any:
|
|
188
196
|
"""
|
|
189
|
-
Same semantics as add_documents
|
|
190
|
-
- Direct mode returns raw server response
|
|
191
|
-
- Batch mode returns batch envelope
|
|
197
|
+
Same semantics as add_documents
|
|
192
198
|
"""
|
|
193
|
-
|
|
194
|
-
# Direct mode
|
|
195
199
|
if files_name and files_data:
|
|
196
200
|
if len(files_name) != len(files_data):
|
|
197
201
|
raise ValueError("files_name and files_data must be same length")
|
|
198
|
-
|
|
199
202
|
payload = {
|
|
200
203
|
"session_id": session_id,
|
|
201
204
|
"user_id": user_id,
|
|
@@ -205,23 +208,130 @@ class VectorLakeClient:
|
|
|
205
208
|
"intelligent_segmentation": intelligent_segmentation
|
|
206
209
|
}
|
|
207
210
|
endpoint = self.config.endpoints["refresh_docs"]
|
|
208
|
-
|
|
209
|
-
return result
|
|
211
|
+
return self._make_request(endpoint, payload, "refresh_docs", batch_num=1)
|
|
210
212
|
|
|
211
|
-
# Batch mode — NOTE: call using positional 'operation' arg (operation first)
|
|
212
213
|
return self._process_files_in_batches(
|
|
213
|
-
"refresh_docs", user_id, vector_lake_description, start_from_batch,
|
|
214
|
+
"refresh_docs", user_id, vector_lake_description, start_from_batch,
|
|
215
|
+
intelligent_segmentation, session_id, files, max_workers=max_workers
|
|
214
216
|
)
|
|
215
217
|
|
|
216
|
-
def health_check(
|
|
218
|
+
def health_check(
|
|
219
|
+
self,
|
|
220
|
+
user_id: str,
|
|
221
|
+
vector_lake_description: str,
|
|
222
|
+
session_id: Optional[str] = None
|
|
223
|
+
) -> Dict[str, Any]:
|
|
217
224
|
endpoint = self.config.endpoints["health"]
|
|
218
|
-
|
|
225
|
+
|
|
226
|
+
payload = {
|
|
227
|
+
"user_id": user_id,
|
|
228
|
+
"vector_lake_description": vector_lake_description,
|
|
229
|
+
"session_id": session_id
|
|
230
|
+
}
|
|
231
|
+
|
|
219
232
|
try:
|
|
220
|
-
result = self._make_request(
|
|
221
|
-
|
|
233
|
+
result = self._make_request(
|
|
234
|
+
endpoint=endpoint,
|
|
235
|
+
payload=payload,
|
|
236
|
+
operation="health"
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
return result
|
|
240
|
+
|
|
222
241
|
except Exception as e:
|
|
223
|
-
return
|
|
242
|
+
return {
|
|
243
|
+
"status": "error",
|
|
244
|
+
"message": str(e),
|
|
245
|
+
"timestamp": time.time()
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str,
                              start_from_batch: int = 1, intelligent_segmentation: bool = False,
                              session_id: Optional[str] = None, files: Optional[List[str]] = None,
                              max_workers: int = 5, batch_delay: float = 2) -> dict:
    """Send files from the filesystem to the API one batch at a time.

    Args:
        operation: Key into ``self.config.endpoints``; also passed to the
            request helper as the operation label.
        user_id: Sent in every payload.
        vector_lake_description: Sent in every payload.
        start_from_batch: 1-based number of the first batch to send; earlier
            batches are skipped (resume support).
        intelligent_segmentation: Sent in every payload.
        session_id: Sent in every payload.
        files: File names to process. When ``None``, every supported regular
            file directly inside ``self.config.vector_lake_path`` is used.
        max_workers: Accepted for interface compatibility; this sequential
            implementation does not read it.
        batch_delay: Seconds to sleep between consecutive batches.

    Returns:
        ``{"mode": "batch", "total_batches": N, "batches": [...]}`` where each
        entry records the batch number, its files, success flag, processing
        time, the response (or ``None``) and, on failure, the error text.
        Per-batch failures are captured in the envelope, not raised.
    """
    base_path = self.config.vector_lake_path

    # 1. Gather files if not provided
    if files is None:
        files = [f for f in os.listdir(base_path)
                 if os.path.isfile(os.path.join(base_path, f)) and self.file_processor.is_supported_file(f)]

    # 2. Create batches using BatchManager
    batches, chunks_dir = self.batch_manager.create_batches(files, base_path)
    logging.info(f"Batches created: {batches}, chunks_dir: {chunks_dir}")

    batch_outputs = []
    start_index = start_from_batch - 1
    endpoint = self.config.endpoints[operation]
    last_index = len(batches) - 1

    for i, batch in enumerate(batches):
        batch_num = i + 1

        # Skip batches if resuming
        if i < start_index:
            logging.info(f"Skipping batch {batch_num}")
            continue

        start_time = time.time()

        try:
            # 3. Read file contents, preferring the chunks folder.
            file_contents = []
            for fname in batch:
                # chunks_dir may be falsy (the sibling _read_files treats it
                # as optional); joining None would raise TypeError and fail
                # the whole batch, so fall back to base_path in that case.
                full_path = os.path.join(chunks_dir, fname) if chunks_dir else None
                if full_path is None or not os.path.exists(full_path):
                    full_path = os.path.join(base_path, fname)
                file_contents.append(self.file_processor.read_file_content(full_path))

            payload = {
                "session_id": session_id,
                "user_id": user_id,
                "vector_lake_description": vector_lake_description,
                "files_name": batch,  # API expects basenames
                "files_data": file_contents,
                "intelligent_segmentation": intelligent_segmentation
            }

            # 4. Make request with backoff
            result = self._make_request_with_backoff(endpoint, payload, operation, batch_num)

            processing_time = time.time() - start_time
            logging.info(f"Batch {batch_num} done")

            batch_outputs.append({
                "batch_number": batch_num,
                "files": batch,
                "success": True,
                "processing_time": round(processing_time, 3),
                "response": result
            })

        except Exception as e:
            # Deliberate best-effort: one failing batch must not abort the rest.
            processing_time = time.time() - start_time
            logging.error(f"Batch {batch_num} failed: {str(e)}")
            batch_outputs.append({
                "batch_number": batch_num,
                "files": batch,
                "success": False,
                "processing_time": round(processing_time, 3),
                "response": None,
                "error": str(e)
            })

        # 5. Delay between batches; nothing follows the last one, so skip it there.
        if i < last_index:
            time.sleep(batch_delay)

    return {
        "mode": "batch",
        "total_batches": len(batches),
        "batches": sorted(batch_outputs, key=lambda x: x["batch_number"])
    }
|
|
334
|
+
|
|
225
335
|
def get_namespace_details(self, user_id: str, session_id: Optional[str] = None, vector_lake_description: Optional[str] = None) -> Dict[str, Any]:
|
|
226
336
|
endpoint = self.config.endpoints["get_namespace_details_by_userid"]
|
|
227
337
|
payload = {"session_id": session_id, "user_id": user_id}
|
|
@@ -252,84 +362,3 @@ class VectorLakeClient:
|
|
|
252
362
|
return result
|
|
253
363
|
except Exception as e:
|
|
254
364
|
return {"status": "error", "message": str(e)}
|
|
255
|
-
|
|
256
|
-
def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
|
|
257
|
-
delay = base_delay
|
|
258
|
-
for attempt in range(retries):
|
|
259
|
-
try:
|
|
260
|
-
result = self._make_request(endpoint, payload, operation, batch_num)
|
|
261
|
-
return result
|
|
262
|
-
except APIError as e:
|
|
263
|
-
if getattr(e, "status_code", None) == 429:
|
|
264
|
-
logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
|
|
265
|
-
time.sleep(delay)
|
|
266
|
-
delay *= 2
|
|
267
|
-
continue
|
|
268
|
-
raise
|
|
269
|
-
except Exception:
|
|
270
|
-
raise
|
|
271
|
-
|
|
272
|
-
def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str, start_from_batch, intelligent_segmentation: bool = False, session_id: Optional[str] = None, files: Optional[List[str]] = None, max_workers: int = 1, batch_delay: float = 2) -> Dict[str, Any]:
|
|
273
|
-
"""
|
|
274
|
-
Processes files from the filesystem in batches and returns a standardized
|
|
275
|
-
envelope:
|
|
276
|
-
{
|
|
277
|
-
"mode": "batch",
|
|
278
|
-
"total_batches": N,
|
|
279
|
-
"batches": [
|
|
280
|
-
{ "batch_number": i, "files": [...], "success": True, "processing_time": x, "response": {...} },
|
|
281
|
-
...
|
|
282
|
-
]
|
|
283
|
-
}
|
|
284
|
-
"""
|
|
285
|
-
if files is None:
|
|
286
|
-
files = [f for f in os.listdir(self.config.vector_lake_path) if os.path.isfile(os.path.join(self.config.vector_lake_path, f)) and self.file_processor.is_supported_file(f)]
|
|
287
|
-
|
|
288
|
-
batches = self.batch_manager.create_batches(files, self.config.vector_lake_path)
|
|
289
|
-
batch_outputs: List[Dict[str, Any]] = []
|
|
290
|
-
start_batch_index = start_from_batch - 1
|
|
291
|
-
if start_from_batch > 1:
|
|
292
|
-
logging.info(f"Resuming from batch {start_from_batch}, skipping first {start_from_batch - 1} batches")
|
|
293
|
-
|
|
294
|
-
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
295
|
-
futures = {}
|
|
296
|
-
for i, batch in enumerate(batches):
|
|
297
|
-
if i < start_batch_index:
|
|
298
|
-
logging.info(f"Skipping batch {i + 1}")
|
|
299
|
-
continue
|
|
300
|
-
batch_num = i + 1
|
|
301
|
-
file_contents = self._read_files(batch)
|
|
302
|
-
payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description, "files_name": batch, "files_data": file_contents, "intelligent_segmentation": intelligent_segmentation}
|
|
303
|
-
endpoint = self.config.endpoints[operation]
|
|
304
|
-
futures[executor.submit(self._make_request_with_backoff, endpoint, payload, operation, batch_num)] = (batch_num, batch, time.time())
|
|
305
|
-
time.sleep(batch_delay)
|
|
306
|
-
|
|
307
|
-
for future in as_completed(futures):
|
|
308
|
-
batch_num, batch, start_time = futures[future]
|
|
309
|
-
processing_time = time.time() - start_time
|
|
310
|
-
try:
|
|
311
|
-
result = future.result()
|
|
312
|
-
logging.info(f"Batch {batch_num} done")
|
|
313
|
-
batch_outputs.append({
|
|
314
|
-
"batch_number": batch_num,
|
|
315
|
-
"files": batch,
|
|
316
|
-
"success": True,
|
|
317
|
-
"processing_time": round(processing_time, 3),
|
|
318
|
-
"response": result
|
|
319
|
-
})
|
|
320
|
-
except Exception as e:
|
|
321
|
-
logging.error(f"Batch {batch_num} failed: {str(e)}")
|
|
322
|
-
batch_outputs.append({
|
|
323
|
-
"batch_number": batch_num,
|
|
324
|
-
"files": batch,
|
|
325
|
-
"success": False,
|
|
326
|
-
"processing_time": round(processing_time, 3),
|
|
327
|
-
"response": None,
|
|
328
|
-
"error": str(e)
|
|
329
|
-
})
|
|
330
|
-
|
|
331
|
-
return {
|
|
332
|
-
"mode": "batch",
|
|
333
|
-
"total_batches": len(batch_outputs),
|
|
334
|
-
"batches": sorted(batch_outputs, key=lambda x: x["batch_number"])
|
|
335
|
-
}
|
|
@@ -8,10 +8,10 @@ class Config:
|
|
|
8
8
|
self,
|
|
9
9
|
api_key: Optional[str] = None,
|
|
10
10
|
host: str = "https://waveflow-analytics.com",
|
|
11
|
-
timeout: int =
|
|
11
|
+
timeout: int = 240,
|
|
12
12
|
max_retries: int = 2,
|
|
13
13
|
max_files_per_batch: int = 100,
|
|
14
|
-
max_batch_size_mb: int =
|
|
14
|
+
max_batch_size_mb: int = 1,
|
|
15
15
|
vector_lake_path: str = "upload",
|
|
16
16
|
log_dir: str = "logs",
|
|
17
17
|
service_port: int = None,
|
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import csv
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
import json
|
|
7
|
+
import shutil
|
|
8
|
+
from typing import List,Tuple
|
|
9
|
+
|
|
10
|
+
from .exceptions import FileProcessingError
|
|
11
|
+
|
|
12
|
+
# logger = logging.getLogger(__name__)
|
|
13
|
+
# logging.basicConfig(level=logging.INFO)
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
# Configure the logging settings
|
|
18
|
+
logging.basicConfig(
|
|
19
|
+
level=logging.DEBUG, # Set to INFO to see only high-level actions, or DEBUG for detail
|
|
20
|
+
format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
|
21
|
+
filename='batch_creation.log', # Log output to a file
|
|
22
|
+
filemode='w' # Overwrite the log file each run
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# If you also want output to the console:
|
|
26
|
+
console_handler = logging.StreamHandler()
|
|
27
|
+
console_handler.setLevel(logging.INFO)
|
|
28
|
+
formatter = logging.Formatter('%(levelname)s: %(message)s')
|
|
29
|
+
console_handler.setFormatter(formatter)
|
|
30
|
+
logging.getLogger().addHandler(console_handler)
|
|
31
|
+
|
|
32
|
+
# Get the logger instance for your class (if needed, though basicConfig covers the root)
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
# -------------------------------------------------------------------------
|
|
36
|
+
# UNIVERSAL HELPERS FOR FILE CONVERSION & CHUNKING
|
|
37
|
+
# -------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
def paragraph_chunk(text: str, max_chars: int) -> list:
    """Group blank-line-separated paragraphs into chunks without splitting any.

    Paragraphs are the non-empty, stripped pieces of ``text`` between
    ``"\\n\\n"`` separators. They are packed greedily; each paragraph is
    budgeted as its length plus two separator characters, and a new chunk is
    started when the running budget would exceed ``max_chars``. A single
    paragraph longer than ``max_chars`` becomes its own (oversized) chunk.

    Returns:
        The chunks, each being its paragraphs joined by ``"\\n\\n"``.
    """
    pieces = []
    pending = []     # paragraphs of the chunk currently being assembled
    pending_len = 0  # budget used so far: len(paragraph) + 2 for each one

    for raw in text.split("\n\n"):
        para = raw.strip()
        if not para:
            continue

        cost = len(para) + 2
        if pending_len + cost <= max_chars:
            pending.append(para)
            pending_len += cost
            continue

        # Budget exceeded: close the current chunk (if any) and start anew.
        if pending:
            pieces.append("\n\n".join(pending))
        pending = [para]
        pending_len = cost

    if pending:
        pieces.append("\n\n".join(pending))

    return pieces
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def json_to_csv(json_path: str) -> str:
    """Convert a JSON array of objects into a CSV file next to the source.

    The CSV columns are the sorted union of all keys found in the objects;
    keys missing from an object are written as empty cells.

    Args:
        json_path: Path to a UTF-8 JSON file whose top level is a list of
            objects.

    Returns:
        Path of the CSV file that was written.

    Raises:
        FileProcessingError: If the JSON is not a list of objects.
    """
    # Swap only the final extension. A plain str.replace would also rewrite
    # ".json" inside directory names, and for a path that does not end in
    # ".json" it would leave the path unchanged so the CSV would overwrite
    # the input file.
    base, ext = os.path.splitext(json_path)
    out_path = base + ".csv" if ext.lower() == ".json" else json_path + ".csv"

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or not all(isinstance(row, dict) for row in data):
        raise FileProcessingError("JSON must be an array of objects.")

    # Extract all keys across JSON objects
    headers = sorted({key for row in data for key in row})

    with open(out_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)

    return out_path
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def convert_to_txt(file_path: str) -> str:
    """Write a ``.txt`` copy of a Python script or of a notebook's code cells.

    For ``.py`` files the content is copied verbatim. For ``.ipynb`` files the
    sources of all cells whose ``cell_type`` is ``"code"`` are joined with
    blank lines. The extension check is case-insensitive, matching the other
    helpers in this module.

    Args:
        file_path: Path to a UTF-8 ``.py`` or ``.ipynb`` file.

    Returns:
        Path of the written text file (same base name, ``.txt`` extension).

    Raises:
        FileProcessingError: If the file is neither ``.py`` nor ``.ipynb``.
    """
    base, _ = os.path.splitext(file_path)
    out_path = base + ".txt"
    lowered = file_path.lower()

    if lowered.endswith(".py"):
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

    elif lowered.endswith(".ipynb"):
        with open(file_path, "r", encoding="utf-8") as f:
            nb = json.load(f)
        # A cell's source may be a single string or a list of line strings;
        # a code cell without a "source" key contributes an empty string.
        sources = (
            cell.get("source", "")
            for cell in nb.get("cells", [])
            if cell.get("cell_type") == "code"
        )
        content = "\n\n".join(
            src if isinstance(src, str) else "".join(src) for src in sources
        )

    else:
        raise FileProcessingError("Unsupported file for text conversion")

    with open(out_path, "w", encoding="utf-8") as out:
        out.write(content)

    return out_path
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def extract_text(file_path: str) -> str:
    """Extract text from a PDF, DOCX, or TXT file.

    The file type is taken from the final extension of the file name
    (case-insensitive). PDF pages are concatenated with a blank line after
    each page; DOCX paragraphs are joined with blank lines; TXT is read as
    UTF-8.

    Args:
        file_path: Path to the file to read.

    Returns:
        The extracted text.

    Raises:
        FileProcessingError: If the extension is not pdf/docx/txt, or the
            optional PyPDF2 / python-docx package cannot be imported.
    """
    # Use the base name's extension; splitting the whole path on "." would
    # treat an extension-less file literally named "txt" as a text file.
    ext = os.path.splitext(file_path)[1].lower().lstrip(".")

    if ext == "pdf":
        try:
            import PyPDF2
        except Exception as e:
            raise FileProcessingError(f"PyPDF2 not available: {e}") from e
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # extract_text() may return None for a page; treat it as empty.
            return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)

    if ext == "docx":
        try:
            from docx import Document
        except Exception as e:
            raise FileProcessingError(f"python-docx not available: {e}") from e
        doc = Document(file_path)
        return "\n\n".join(p.text for p in doc.paragraphs)

    if ext == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    raise FileProcessingError("Unsupported extraction type.")
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def split_csv_if_needed(file_path: str, max_bytes: int) -> list:
    """Split a CSV into ``<base>_part_<n><ext>`` files when it is too large.

    A 20% safety buffer is applied: the working limit is ``0.8 * max_bytes``.
    Files at or under that limit are returned untouched. Otherwise rows are
    packed into numbered part files, each repeating the header; rows are
    never split. Row size is estimated as the UTF-8 length of the
    comma-joined fields (quoting and line terminators are not counted).

    Args:
        file_path: Path to a UTF-8 CSV file whose first row is the header.
        max_bytes: Maximum allowed size before the safety buffer.

    Returns:
        ``[file_path]`` if no split was needed, else the part file paths in
        order.

    Raises:
        FileProcessingError: If a single row alone exceeds the safe limit.
    """
    safe_limit = int(max_bytes * 0.8)
    if os.path.getsize(file_path) <= safe_limit:
        return [file_path]

    base, ext = os.path.splitext(file_path)
    output_files = []

    def write_part(header, rows):
        # Parts are numbered from 1 in the order they are written.
        out_file = f"{base}_part_{len(output_files) + 1}{ext}"
        with open(out_file, "w", newline="", encoding="utf-8") as out:
            writer = csv.writer(out)
            writer.writerow(header)
            writer.writerows(rows)
        output_files.append(out_file)

    with open(file_path, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            # Nothing to split (no rows at all).
            return [file_path]
        header_bytes = len(",".join(header).encode("utf-8"))

        rows = []
        current_size = header_bytes

        for row in reader:
            row_bytes = len(",".join(row).encode("utf-8"))

            if row_bytes > safe_limit:
                raise FileProcessingError("A CSV row exceeds the safe max size.")

            # Only flush when there is something to flush; otherwise a large
            # header plus the first row would produce a header-only part.
            if rows and current_size + row_bytes > safe_limit:
                write_part(header, rows)
                rows = []
                current_size = header_bytes

            rows.append(row)
            current_size += row_bytes

        # last chunk
        if rows:
            write_part(header, rows)

    return output_files
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# -------------------------------------------------------------------------
|
|
199
|
+
# FILE PROCESSOR
|
|
200
|
+
# -------------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
class FileProcessor:
    """Static helpers for recognising and reading supported document files."""

    # NOTE(review): 'ipynb' is listed as supported but read_file_content has
    # no branch for it — presumably notebooks are converted before reading;
    # verify against the caller.
    SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf', 'ipynb']

    @staticmethod
    def _extension(path: str) -> str:
        """Return the lower-cased extension of the file name, or '' if none.

        Only the base name is inspected, so dots in directory names are
        ignored and an extension-less file called e.g. ``csv`` is not
        mistaken for a CSV file.
        """
        name = os.path.basename(path)
        if '.' not in name:
            return ''
        return name.rsplit('.', 1)[-1].lower()

    @staticmethod
    def read_file_content(filepath: str) -> str:
        """Return the text content of ``filepath``.

        txt/csv/py/json are read as UTF-8 text; docx paragraphs and pdf page
        texts are joined with newlines.

        Raises:
            FileProcessingError: If the extension has no reader here.
        """
        ext = FileProcessor._extension(filepath)
        if ext in ['txt', 'csv', 'py', 'json']:
            with open(filepath, encoding='utf-8') as f:
                return f.read()
        if ext == 'docx':
            from docx import Document
            doc = Document(filepath)
            return '\n'.join(p.text for p in doc.paragraphs)
        if ext == 'pdf':
            import PyPDF2
            with open(filepath, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for a page; treat as empty.
                return '\n'.join(p.extract_text() or '' for p in reader.pages)
        raise FileProcessingError(f'Unsupported extension: {ext}')

    @staticmethod
    def get_file_size(filepath: str) -> int:
        """Return the size of ``filepath`` in bytes."""
        return os.path.getsize(filepath)

    @staticmethod
    def is_supported_file(filename: str) -> bool:
        """Return True if the file name's extension is in SUPPORTED_EXTENSIONS."""
        return FileProcessor._extension(filename) in FileProcessor.SUPPORTED_EXTENSIONS
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# -------------------------------------------------------------------------
|
|
233
|
+
# LOGGER
|
|
234
|
+
# -------------------------------------------------------------------------
|
|
235
|
+
|
|
236
|
+
class Logger:
    """Append-only CSV logs for skipped files, API errors and request metrics."""

    def __init__(self, log_dir: str):
        """Create ``log_dir`` (with parents) if needed and set the log paths."""
        log_root = Path(log_dir)
        log_root.mkdir(parents=True, exist_ok=True)
        self.skipped_log = log_root / 'skipped_files_log.csv'
        self.api_error_log = log_root / 'api_error_log.csv'
        self.performance_log = log_root / 'api_performance_log.csv'

    def _write_csv_log(self, path: Path, header: List[str], row: List):
        """Append ``row`` to ``path``, writing ``header`` first for a new log.

        The header is also written when the file exists but is empty, so a
        pre-created or truncated log still gets its column names.
        """
        needs_header = not path.exists() or path.stat().st_size == 0
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(header)
            writer.writerow(row)

    def log_skipped_file(self, filename: str, reason: str):
        """Record that ``filename`` was skipped and why, with a UTC timestamp."""
        self._write_csv_log(self.skipped_log,
                            ['ts', 'filename', 'reason'],
                            [datetime.utcnow().isoformat(), filename, reason])

    def log_api_error(self, operation: str, batch_num: int, error_message: str):
        """Record an API error for ``operation``/``batch_num`` with a UTC timestamp."""
        self._write_csv_log(self.api_error_log,
                            ['ts', 'operation', 'batch_num', 'err'],
                            [datetime.utcnow().isoformat(), operation, batch_num, error_message])

    def log_performance(self, **kwargs):
        """Append one performance row built from arbitrary keyword arguments.

        Each value is stringified: list/tuple/set items are comma-joined,
        ``None`` becomes an empty string, and a value whose conversion raises
        is recorded as ``"UNSERIALIZABLE"``. A ``timestamp`` column comes
        first. The header is taken from the first row written to the file,
        so callers should pass the same keywords on every call.
        """
        # Start row with timestamp
        row = {"timestamp": datetime.utcnow().isoformat()}

        # Safely encode all user-provided fields
        for k, v in kwargs.items():
            try:
                if isinstance(v, (list, tuple, set)):
                    row[k] = ",".join(str(i) for i in v)
                else:
                    row[k] = str(v) if v is not None else ""
            except Exception:
                # Best-effort logging: never let a bad __str__ break a request.
                row[k] = "UNSERIALIZABLE"

        # Reuse the shared writer so all three logs follow one header policy.
        self._write_csv_log(self.performance_log, list(row.keys()), list(row.values()))
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# -------------------------------------------------------------------------
|
|
291
|
+
# BATCH MANAGER (UPDATED)
|
|
292
|
+
# -------------------------------------------------------------------------
|
|
293
|
+
class _RowEcho:
    """Write target that simply hands back the text it is given.

    ``csv.writer(...).writerow`` returns the result of the underlying
    ``write`` call, so a writer bound to this object yields the exact text a
    row occupies on disk, including quoting and the line terminator.
    """

    def write(self, text):
        return text


class BatchManager:
    """Split input files into size-bounded chunks and group them into batches.

    ``max_files`` caps the number of items per batch and ``max_size_mb`` caps
    the summed size of a batch. Individual chunks are produced against a
    "safe limit" of 80% of the byte cap.
    """

    def __init__(self, max_files: int, max_size_mb: int):
        self.max_files = max_files
        self.max_bytes = max_size_mb * 1024 * 1024  # convert MB to bytes

    def create_batches(self, files: List[str], base_path: str) -> Tuple[List[List[str]], str]:
        """Preprocess ``files`` into ``<base_path>/chunks`` and batch the results.

        Args:
            files: file names relative to ``base_path``.
            base_path: directory containing the input files.

        Returns:
            ``(batches, chunks_dir)`` where each batch is a list of file
            *names* (basenames) located inside ``chunks_dir``.

        Raises:
            FileProcessingError: if a file cannot be chunked below the limits.
        """
        # Log initial parameters
        logger.info(f"Starting batch creation. Max files: {self.max_files}, Max bytes: {self.max_bytes}")

        SAFE_LIMIT = int(self.max_bytes * 0.8)
        logger.debug(f"Calculated SAFE_LIMIT (80% of max_bytes): {SAFE_LIMIT} bytes")

        chunks_dir = os.path.join(base_path, "chunks")
        os.makedirs(chunks_dir, exist_ok=True)
        logger.info(f"Chunks directory created: {chunks_dir}")

        # -----------------------------
        # Step 1: Preprocess files
        # -----------------------------
        logger.info("--- Step 1: Preprocessing and Chunking Files ---")
        file_info = []
        for f in files:
            file_info.extend(self._prepare_file(f, base_path, chunks_dir, SAFE_LIMIT))

        logger.info(f"Preprocessing complete. Total items ready for batching: {len(file_info)}")

        # -----------------------------
        # Step 2: Create dynamic batches
        # -----------------------------
        logger.info("--- Step 2: Creating Dynamic Batches ---")
        batches = self._pack_batches(file_info)

        logger.info(f"Batch creation complete. Total batches created: {len(batches)}")

        return batches, chunks_dir

    def _prepare_file(self, f: str, base_path: str, chunks_dir: str, safe_limit: int) -> List[Tuple[str, int]]:
        """Convert/chunk one input file; return ``(path, size)`` for each piece written."""
        filepath = os.path.join(base_path, f)

        if not os.path.isfile(filepath):
            logger.warning(f"File not found, skipping: {filepath}")
            return []

        ext = f.lower().split(".")[-1]
        logger.debug(f"Processing file: {f}, detected extension: {ext}")

        # JSON → CSV
        if ext == "json":
            original_path = filepath
            filepath = json_to_csv(filepath)
            ext = "csv"
            logger.info(f"Converted JSON file {original_path} to CSV at {filepath}")

        # PY/IPYNB → TXT
        elif ext in ("py", "ipynb"):
            original_path = filepath
            filepath = convert_to_txt(filepath)
            ext = "txt"
            logger.info(f"Converted code file {original_path} to TXT at {filepath}")

        # PDF/DOCX/TXT → paragraph chunks
        if ext in ("pdf", "docx", "txt"):
            text = extract_text(filepath)
            parts = self._write_text_parts(paragraph_chunk(text, max_chars=safe_limit), filepath, chunks_dir)
            for i, (out_file, size) in enumerate(parts, 1):
                logger.debug(f"Chunked file {f} into part {i}: {os.path.basename(out_file)}, Size: {size} bytes")
            logger.info(f"Successfully chunked {f} into {len(parts)} parts.")
            return parts

        # CSV → safe split
        if ext == "csv":
            paths = self._split_csv_safe(filepath, safe_limit, chunks_dir)
            logger.info(f"Successfully split CSV file {f} into {len(paths)} safe chunks.")
            return [(p, os.path.getsize(p)) for p in paths]

        # Other files → copy to chunks folder, or chunk as text if large
        size = os.path.getsize(filepath)
        if size > safe_limit:
            logger.warning(f"File {f} ({size} bytes) exceeds SAFE_LIMIT ({safe_limit} bytes). Attempting generic TXT chunking.")
            try:
                with open(filepath, "r", encoding="utf-8", errors="ignore") as f_in:
                    content = f_in.read()
                parts = self._write_text_parts(paragraph_chunk(content, max_chars=safe_limit), filepath, chunks_dir)
            except Exception as e:
                logger.error(f"Failed to chunk large file {filepath} as TXT. Error: {e}")
                raise FileProcessingError(f"File {filepath} exceeds batch limit and cannot be chunked") from e
            for i, (out_file, _size) in enumerate(parts, 1):
                logger.debug(f"Generic chunked file {f} into part {i}: {os.path.basename(out_file)}")
            return parts

        # Copy small files to chunks directory. The copy is refreshed on every
        # run so that the content in chunks_dir always matches the size
        # recorded here (a leftover copy from an earlier run may be stale).
        new_path = os.path.join(chunks_dir, os.path.basename(filepath))
        if os.path.abspath(filepath) != os.path.abspath(new_path):
            shutil.copy2(filepath, new_path)
        logger.debug(f"Copied small file {f} to chunks_dir. Size: {size} bytes.")
        return [(new_path, size)]

    @staticmethod
    def _write_text_parts(blocks, source_path: str, chunks_dir: str) -> List[Tuple[str, int]]:
        """Write each text block to ``<stem>_part_<i>.txt``; return ``(path, size)`` pairs."""
        base_fname, _ = os.path.splitext(os.path.basename(source_path))
        written = []
        for i, block in enumerate(blocks, 1):
            out_file = os.path.join(chunks_dir, f"{base_fname}_part_{i}.txt")
            with open(out_file, "w", encoding="utf-8") as out:
                out.write(block)
            written.append((out_file, os.path.getsize(out_file)))
        return written

    def _pack_batches(self, file_info: List[Tuple[str, int]]) -> List[List[str]]:
        """Greedily group ``(path, size)`` items into batches of file names within both limits."""
        batches = []
        cur_batch = []
        cur_size = 0

        for filepath, size in file_info:
            filename = os.path.basename(filepath)
            logger.debug(f"Considering file: {filename}, Size: {size} bytes. Current batch size: {cur_size} bytes, files: {len(cur_batch)}")

            # Absolute safety check (shouldn't happen if Step 1 worked)
            if size > self.max_bytes:
                logger.critical(f"Chunked file {filename} ({size} bytes) still exceeds max batch size ({self.max_bytes} bytes).")
                raise FileProcessingError(f"File {filepath} exceeds max batch size ({self.max_bytes} bytes)")

            # Flush current batch if limits exceeded
            if cur_batch and (len(cur_batch) >= self.max_files or (cur_size + size) > self.max_bytes):
                logger.info(f"Batch limit hit. Flushing batch with {len(cur_batch)} files and {cur_size} bytes.")
                batches.append(cur_batch)
                cur_batch = []
                cur_size = 0

            cur_batch.append(filename)
            cur_size += size
            logger.debug(f"Added file {filename}. New batch size: {cur_size} bytes, files: {len(cur_batch)}")

        if cur_batch:
            logger.info(f"Flushing final batch with {len(cur_batch)} files and {cur_size} bytes.")
            batches.append(cur_batch)

        return batches

    @staticmethod
    def _detect_encoding(file_path, candidates: List[str]):
        """Return the first encoding in ``candidates`` that decodes the whole file, else ``None``."""
        for encoding in candidates:
            try:
                # Decode the complete file (streamed), not just the header, so
                # that invalid bytes deep inside the file are noticed here.
                with open(file_path, "r", newline="", encoding=encoding) as probe:
                    while probe.read(1024 * 1024):
                        pass
            except UnicodeDecodeError:
                logger.warning(f"Failed to decode with {encoding}. Trying next encoding...")
                continue
            except (OSError, LookupError) as e:
                raise FileProcessingError(f"Error opening or reading file {file_path}: {e}") from e
            return encoding
        return None

    @staticmethod
    def _write_csv_part(out_path: Path, header: List[str], rows: List[List[str]]) -> None:
        """Write ``header`` followed by ``rows`` to ``out_path`` as UTF-8 CSV."""
        with open(out_path, "w", newline="", encoding="utf-8") as out:
            writer = csv.writer(out)
            writer.writerow(header)
            writer.writerows(rows)

    def _split_csv_safe(self, file_path: str, safe_limit: int, chunks_dir: str, preferred_encoding: str = "utf-8") -> List[str]:
        """
        Split a CSV into parts that respect row boundaries and ``safe_limit``.

        Every part repeats the header row and is written as UTF-8 to
        ``<chunks_dir>/<stem>_part_<n><suffix>``. Sizes are measured on the
        serialized form of each row (quoting and line terminator included),
        so a part stays within ``safe_limit`` bytes whenever the header plus
        at least one row fits; a part always carries at least one data row.

        The input encoding is the first of ``preferred_encoding``, cp1252 and
        latin-1 that decodes the entire file.

        Returns a list of chunk file paths inside chunks_dir (empty when the
        file has a header but no data rows).

        Raises:
            FileProcessingError: if the file is empty, cannot be read or
                parsed, or contains a single row larger than ``safe_limit``.
        """
        input_file = Path(file_path)
        output_dir = Path(chunks_dir)
        output_files = []

        # 1. Ensure the output directory exists
        output_dir.mkdir(parents=True, exist_ok=True)

        base_fname = input_file.stem
        ext = input_file.suffix

        # 2. Determine the input encoding by decoding the whole file
        encodings_to_try = [preferred_encoding, "cp1252", "latin-1"]
        effective_encoding = self._detect_encoding(file_path, encodings_to_try)
        if not effective_encoding:
            raise FileProcessingError(f"Could not decode CSV file {file_path} using any of the tested encodings: {', '.join(encodings_to_try)}")

        # Serializes a row exactly as csv.writer would, without touching disk.
        measure = csv.writer(_RowEcho())

        # 3. Main splitting logic. errors='replace' is kept as a safety net in
        # case the file changes between detection and this read.
        try:
            with open(input_file, "r", newline="", encoding=effective_encoding, errors='replace') as f:
                reader = csv.reader(f)
                try:
                    header = next(reader)
                except StopIteration:
                    raise FileProcessingError(f"CSV file {file_path} is empty.") from None

                header_bytes = len(measure.writerow(header).encode("utf-8"))

                part = 1
                rows = []
                current_size = header_bytes

                for row in reader:
                    row_bytes = len(measure.writerow(row).encode("utf-8"))

                    if row_bytes > safe_limit:
                        raise FileProcessingError(f"A CSV row exceeds the safe max size ({safe_limit} bytes) at part {part}")

                    # Flush only when there is something to flush; otherwise a
                    # header-only part would be emitted.
                    if rows and current_size + row_bytes > safe_limit:
                        out_path = output_dir / f"{base_fname}_part_{part}{ext}"
                        self._write_csv_part(out_path, header, rows)
                        output_files.append(str(out_path))

                        # Reset for the next chunk
                        part += 1
                        rows = []
                        current_size = header_bytes

                    # Add the row to the current chunk
                    rows.append(row)
                    current_size += row_bytes

                # Write the final chunk
                if rows:
                    out_path = output_dir / f"{base_fname}_part_{part}{ext}"
                    self._write_csv_part(out_path, header, rows)
                    output_files.append(str(out_path))
        except csv.Error as e:
            raise FileProcessingError(f"Error opening or reading file {file_path}: {e}") from e

        return output_files
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: waveflowdb_client
|
|
3
|
-
Version: 0.0
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
|
|
5
5
|
Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
|
|
6
6
|
License: MIT License
|
|
@@ -27,7 +27,7 @@ License: MIT License
|
|
|
27
27
|
|
|
28
28
|
Project-URL: Homepage, https://agentanalytics.ai
|
|
29
29
|
Project-URL: Documentation, https://www.agentanalytics.ai/docs/waveflow-db
|
|
30
|
-
Keywords: vector db,VECTOR QUERY LANGUAGE,
|
|
30
|
+
Keywords: vector db,VECTOR QUERY LANGUAGE,waveflowdb,agentanalytics,VQL
|
|
31
31
|
Requires-Python: >=3.8
|
|
32
32
|
Description-Content-Type: text/markdown
|
|
33
33
|
License-File: LICENSE
|
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import csv
|
|
3
|
-
import logging
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from datetime import datetime
|
|
6
|
-
import json
|
|
7
|
-
import traceback
|
|
8
|
-
import time
|
|
9
|
-
from typing import List
|
|
10
|
-
# Heavy libs are imported at runtime when needed in real environments.
|
|
11
|
-
|
|
12
|
-
from .exceptions import FileProcessingError
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
logging.basicConfig(level=logging.INFO)
|
|
16
|
-
|
|
17
|
-
class FileProcessor:
    """Helpers for reading, sizing and filtering the file types the SDK accepts."""

    SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf']

    @staticmethod
    def read_file_content(filepath: str) -> str:
        """Return the text of ``filepath``, chosen by the suffix after its last dot.

        Plain-text formats are read as UTF-8; ``docx`` and ``pdf`` are parsed
        with python-docx / PyPDF2, which are imported only when needed.
        Raises ``FileProcessingError`` for a missing parser package or an
        unsupported suffix.
        """
        suffix = filepath.lower().rsplit('.', 1)[-1]

        if suffix in ('txt', 'csv', 'py', 'json'):
            with open(filepath, encoding='utf-8') as handle:
                text = handle.read()
            return text

        # Parser packages are pulled in lazily so importing this module never
        # fails just because an optional dependency is absent.
        if suffix == 'docx':
            try:
                from docx import Document
            except Exception as exc:
                raise FileProcessingError(f"python-docx not available: {exc}")
            paragraphs = Document(filepath).paragraphs
            return '\n'.join(par.text for par in paragraphs)

        if suffix == 'pdf':
            try:
                import PyPDF2
            except Exception as exc:
                raise FileProcessingError(f"PyPDF2 not available: {exc}")
            with open(filepath, 'rb') as handle:
                pdf = PyPDF2.PdfReader(handle)
                page_texts = [(page.extract_text() or '') for page in pdf.pages]
            return '\n'.join(page_texts)

        raise FileProcessingError(f'Unsupported extension: {suffix}')

    @staticmethod
    def get_file_size(filepath: str) -> int:
        """Return the size of ``filepath`` in bytes."""
        return os.stat(filepath).st_size

    @staticmethod
    def is_supported_file(filename: str) -> bool:
        """Tell whether the text after the last dot of ``filename`` is a supported extension."""
        suffix = filename.lower().rsplit('.', 1)[-1]
        return suffix in FileProcessor.SUPPORTED_EXTENSIONS
|
|
52
|
-
|
|
53
|
-
class Logger:
    """CSV-backed logger for skipped files, API errors and performance data.

    Each kind of event goes to its own CSV file inside ``log_dir``.
    """

    def __init__(self, log_dir: str):
        """Create ``log_dir`` if necessary and derive the three log paths."""
        Path(log_dir).mkdir(parents=True, exist_ok=True)
        self.skipped_log = Path(log_dir) / 'skipped_files_log.csv'
        self.api_error_log = Path(log_dir) / 'api_error_log.csv'
        self.performance_log = Path(log_dir) / 'api_performance_log.csv'

    def _write_csv_log(self, path: Path, header: List[str], row: List):
        """Append ``row`` to ``path``, writing ``header`` first if the file is new."""
        exists = path.exists()
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not exists:
                writer.writerow(header)
            writer.writerow(row)

    def log_skipped_file(self, filename: str, reason: str):
        """Record that ``filename`` was skipped and why."""
        self._write_csv_log(self.skipped_log, ['ts', 'filename', 'reason'], [datetime.utcnow().isoformat(), filename, reason])

    def log_api_error(self, operation: str, batch_num: int, error_message: str):
        """Record an API failure for ``operation`` on batch ``batch_num``."""
        self._write_csv_log(self.api_error_log, ['ts', 'operation', 'batch_num', 'err'], [datetime.utcnow().isoformat(), operation, batch_num, error_message])

    def log_performance(self, operation=None, batch_num=None, latency=None,
                        request_size=None, response_size=None, result_count=None,
                        files_processed=None, error: Exception = None):
        """
        SAFE CSV logger for all SDK operations.

        Every argument is stringified defensively (collections are
        comma-joined, ``None`` becomes an empty string), and any failure while
        writing is printed instead of raised, so logging never interrupts the
        calling operation.

        The row is written to ``self.performance_log`` inside the configured
        log directory. An explicitly set ``perf_csv`` attribute still takes
        precedence, and ``performance_logs.csv`` in the working directory
        remains the last-resort fallback.
        """
        # The log directory chosen in __init__ is honoured here; without this
        # lookup the file would land in the current working directory.
        logfile = getattr(self, "perf_csv", None) or getattr(self, "performance_log", "performance_logs.csv")

        # --- Safe stringify for ANY type ---
        def safe_str(value):
            try:
                if isinstance(value, (list, tuple, set)):
                    return ",".join(str(v) for v in value)
                return str(value) if value is not None else ""
            except Exception:
                return "UNSERIALIZABLE"

        row = {
            "timestamp": datetime.utcnow().isoformat(),
            "operation": safe_str(operation),
            "batch_num": safe_str(batch_num),
            "latency_ms": safe_str(latency),
            "request_size": safe_str(request_size),
            "response_size": safe_str(response_size),
            "result_count": safe_str(result_count),
            "files_processed": safe_str(files_processed),
            "error": safe_str(error),
        }

        try:
            file_exists = os.path.isfile(logfile)

            with open(logfile, mode="a", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=row.keys())

                if not file_exists:
                    writer.writeheader()

                writer.writerow(row)

        except Exception:
            # Deliberately best-effort: report the problem, never raise.
            print("[CSV LOGGING ERROR] Could not write performance log:")
            print(traceback.format_exc())
|
|
119
|
-
|
|
120
|
-
class BatchManager:
    """Group files into batches bounded by a file count and a total byte size."""

    def __init__(self, max_files: int, max_size_mb: int):
        """Store the per-batch file cap and the byte cap (given in MB)."""
        self.max_files = max_files
        self.max_bytes = max_size_mb * 1024 * 1024

    def create_batches(self, files: List[str], base_path: str) -> List[List[str]]:
        """Return lists of file names from ``files`` packed greedily in input order.

        Files whose size cannot be determined, and files larger than the byte
        cap on their own, are left out. A new batch is started whenever adding
        the next file would exceed either limit.
        """
        # Pass 1: keep only the files that can be sized and fit in a batch.
        sized = []
        for name in files:
            full_path = os.path.join(base_path, name)
            try:
                nbytes = os.path.getsize(full_path)
                fits = nbytes <= self.max_bytes
            except Exception:
                continue
            if fits:
                sized.append((name, nbytes))

        # Pass 2: greedy packing.
        grouped = []
        bucket = []
        bucket_bytes = 0
        for name, nbytes in sized:
            limit_hit = len(bucket) >= self.max_files or (bucket_bytes + nbytes) > self.max_bytes
            if limit_hit and bucket:
                grouped.append(bucket)
                bucket, bucket_bytes = [], 0
            bucket.append(name)
            bucket_bytes += nbytes

        if bucket:
            grouped.append(bucket)
        return grouped
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
{waveflowdb_client-0.0.4 → waveflowdb_client-1.0.0}/waveflowdb_client.egg-info/top_level.txt
RENAMED
|
File without changes
|