waveflowdb-client 0.0.4__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waveflowdb_client
3
- Version: 0.0.4
3
+ Version: 1.0.0
4
4
  Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
5
5
  Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
6
6
  License: MIT License
@@ -27,7 +27,7 @@ License: MIT License
27
27
 
28
28
  Project-URL: Homepage, https://agentanalytics.ai
29
29
  Project-URL: Documentation, https://www.agentanalytics.ai/docs/waveflow-db
30
- Keywords: vector db,VECTOR QUERY LANGUAGE,waveflow,agentanalytics,VQL
30
+ Keywords: vector db,VECTOR QUERY LANGUAGE,waveflowdb,agentanalytics,VQL
31
31
  Requires-Python: >=3.8
32
32
  Description-Content-Type: text/markdown
33
33
  License-File: LICENSE
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "waveflowdb_client" # pip install name
7
- version = "0.0.4"
7
+ version = "1.0.0"
8
8
  description = "VectorLake SDK — Deterministic backend engine powering agent workflows"
9
9
  readme = "readme.md"
10
10
  requires-python = ">=3.8"
@@ -14,7 +14,7 @@ authors = [
14
14
  { name = "agentanalytics.ai", email = "nitin@agentanalytics.ai" }
15
15
  ]
16
16
 
17
- keywords = ["vector db", "VECTOR QUERY LANGUAGE", "waveflow", "agentanalytics", "VQL"]
17
+ keywords = ["vector db", "VECTOR QUERY LANGUAGE", "waveflowdb", "agentanalytics", "VQL"]
18
18
 
19
19
  dependencies = [
20
20
  "requests",
@@ -1,4 +1,4 @@
1
- __version__ = "0.0.4"
1
+ __version__ = "1.0.0"
2
2
 
3
3
  from .client import VectorLakeClient
4
4
  from .config import Config
@@ -3,13 +3,12 @@ import logging
3
3
  import json
4
4
  import requests
5
5
  import os
6
- from concurrent.futures import ThreadPoolExecutor, as_completed
7
6
  from typing import List, Optional, Dict, Any
8
7
 
9
8
  from .config import Config
10
9
  from .utils import FileProcessor, Logger, BatchManager
11
- from .exceptions import APIError
12
- from .models import HealthResponse # removed BatchResult import
10
+ from .exceptions import APIError, FileProcessingError
11
+ from .models import HealthResponse
13
12
 
14
13
  logger = logging.getLogger(__name__)
15
14
  logging.basicConfig(level=logging.INFO)
@@ -24,36 +23,42 @@ class VectorLakeClient:
24
23
  self.logger = Logger(config.log_dir)
25
24
  self.batch_manager = BatchManager(config.max_files_per_batch, config.max_batch_size_mb)
26
25
  self.file_processor = FileProcessor()
27
- self.perf_csv = "performance_logs.csv"
28
26
 
29
27
  def _get_headers(self) -> Dict[str, str]:
30
- return {
31
- 'Content-Type': 'application/json',
32
- 'x-api-key': self.config.api_key
33
- }
28
+ return {'Content-Type': 'application/json', 'x-api-key': self.config.api_key}
34
29
 
35
30
  def _make_request(self, endpoint: str, payload: Dict[str, Any], operation: str = "", batch_num: int = 0) -> Dict[str, Any]:
36
31
  headers = self._get_headers()
37
- request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload is not None else 0
32
+ request_size = len(json.dumps(payload).encode('utf-8')) / 1024 if payload else 0
33
+
38
34
  for attempt in range(self.config.max_retries):
39
35
  try:
40
36
  start_time = time.time()
41
37
  response = requests.post(endpoint, json=payload, headers=headers, timeout=self.config.timeout)
42
38
  latency = (time.time() - start_time) * 1000
39
+
43
40
  try:
44
41
  result = response.json()
45
42
  except Exception:
46
43
  result = {"status_code": response.status_code, "text": response.text}
47
44
 
48
45
  if operation:
49
- response_size = len(response.content) / 1024 if response.content is not None else 0
46
+ response_size = len(response.content) / 1024 if response.content else 0
50
47
  result_count = len(result.get("results", [])) if isinstance(result, dict) else "N/A"
51
- self.logger.log_performance(operation, batch_num, latency, request_size, response_size, result_count)
48
+ self.logger.log_performance(
49
+ operation=operation,
50
+ batch_num=batch_num,
51
+ latency=latency,
52
+ request_size=request_size,
53
+ response_size=response_size,
54
+ result_count=result_count
55
+ )
52
56
 
53
57
  if response.status_code >= 400:
54
58
  raise APIError(result.get('message', f'HTTP {response.status_code}'), status_code=response.status_code, response_text=response.text)
55
59
 
56
60
  return result
61
+
57
62
  except requests.exceptions.RequestException as e:
58
63
  if attempt == self.config.max_retries - 1:
59
64
  error_msg = f"Request failed after {self.config.max_retries} attempts: {str(e)}"
@@ -62,19 +67,38 @@ class VectorLakeClient:
62
67
  raise APIError(error_msg, getattr(e.response, 'status_code', None), getattr(e.response, 'text', None))
63
68
  time.sleep(2 ** attempt)
64
69
 
65
- def _read_files(self, filenames: List[str]) -> List[str]:
70
+ def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
71
+ delay = base_delay
72
+ for attempt in range(retries):
73
+ try:
74
+ return self._make_request(endpoint, payload, operation, batch_num)
75
+ except APIError as e:
76
+ if getattr(e, "status_code", None) == 429:
77
+ logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
78
+ time.sleep(delay)
79
+ delay *= 2
80
+ continue
81
+ raise
82
+ except Exception:
83
+ raise
84
+
85
+ def _read_files(self, filenames: List[str], chunks_dir: Optional[str] = None) -> List[str]:
86
+ """
87
+ Reads files safely. If chunks_dir is provided, reads files from that folder.
88
+ """
66
89
  contents = []
67
- for filename in filenames:
68
- filepath = os.path.join(self.config.vector_lake_path, filename)
90
+ for fname in filenames:
91
+ path_base = chunks_dir if chunks_dir else self.config.vector_lake_path
92
+ filepath = os.path.join(path_base, fname)
69
93
  try:
70
- if self.file_processor.is_supported_file(filename):
94
+ if self.file_processor.is_supported_file(fname):
71
95
  content = self.file_processor.read_file_content(filepath)
72
96
  contents.append(content)
73
97
  else:
74
- self.logger.log_skipped_file(filename, "Unsupported file type")
98
+ self.logger.log_skipped_file(fname, "Unsupported file type")
75
99
  contents.append("")
76
100
  except Exception as e:
77
- self.logger.log_skipped_file(filename, f"Read error: {str(e)}")
101
+ self.logger.log_skipped_file(fname, f"Read error: {str(e)}")
78
102
  contents.append("")
79
103
  return contents
80
104
 
@@ -106,36 +130,23 @@ class VectorLakeClient:
106
130
  "pattern": pattern
107
131
  }
108
132
 
109
- # DIRECT MODE: user provided files_name + files_data -> return raw server response
133
+ # Direct mode
110
134
  if files_name and files_data:
111
135
  if len(files_name) != len(files_data):
112
136
  raise ValueError("files_name and files_data must be same length")
113
-
114
- # ensure names are sanitized (no full paths)
115
137
  clean_names = [os.path.basename(n) for n in files_name]
116
- payload.update({
117
- "files_name": clean_names,
118
- "files_data": files_data,
119
- "pattern": "dynamic"
120
- })
138
+ payload.update({"files_name": clean_names, "files_data": files_data, "pattern": "dynamic"})
121
139
  return self._make_request(endpoint, payload, endpoint_key)
122
140
 
123
- # DYNAMIC MODE: read from local filesystem, then return raw server response
141
+ # Dynamic mode: read from filesystem
124
142
  if pattern == "dynamic" and files:
125
- # sanitize file names (server expects names, not full paths)
126
- clean_names = [os.path.basename(f) for f in files]
127
- file_contents = self._read_files(clean_names) if all(os.path.exists(os.path.join(self.config.vector_lake_path, os.path.basename(f))) for f in files) else self._read_files(files)
128
- payload.update({
129
- "files_name": clean_names,
130
- "files_data": file_contents,
131
- "pattern": "dynamic"
132
- })
133
-
134
- # debug print — remove or comment out in prod if not needed
135
- logging.debug("MATCHING DOCS DYNAMIC PAYLOAD: %s", json.dumps(payload, indent=2)[:2000])
143
+ batches, chunks_dir = self.batch_manager.create_batches(files, self.config.vector_lake_path)
144
+ flat_files = [fname for batch in batches for fname in batch]
145
+ file_contents = self._read_files(flat_files, chunks_dir)
146
+ payload.update({"files_name": flat_files, "files_data": file_contents, "pattern": "dynamic"})
136
147
  return self._make_request(endpoint, payload, endpoint_key)
137
148
 
138
- # STATIC MODE: just run the request and return raw response
149
+ # Static mode
139
150
  return self._make_request(endpoint, payload, endpoint_key)
140
151
 
141
152
  def add_documents(self,
@@ -149,11 +160,9 @@ class VectorLakeClient:
149
160
  files_data: Optional[List[str]] = None,
150
161
  max_workers=5) -> Any:
151
162
  """
152
- Direct mode returns raw server response.
153
- Filesystem batch mode returns a 'batch' envelope.
163
+ Add documents either in direct mode (names + data) or batch mode (filesystem).
154
164
  """
155
-
156
- # Direct mode: user supplied names + data -> single request (raw server response)
165
+ # Direct mode
157
166
  if files_name and files_data:
158
167
  if len(files_name) != len(files_data):
159
168
  raise ValueError("files_name and files_data must be same length")
@@ -166,13 +175,12 @@ class VectorLakeClient:
166
175
  "intelligent_segmentation": intelligent_segmentation
167
176
  }
168
177
  endpoint = self.config.endpoints["add_docs"]
169
- # batch_num is OK for add_docs (server batch logging); set to 1 for this single request
170
- result = self._make_request(endpoint, payload, "add_docs", batch_num=1)
171
- return result
178
+ return self._make_request(endpoint, payload, "add_docs", batch_num=1)
172
179
 
173
- # Batch mode: filesystem-driven -> process in batches and return envelope
180
+ # Batch mode
174
181
  return self._process_files_in_batches(
175
- "add_docs", user_id, vector_lake_description, start_from_batch, intelligent_segmentation, session_id, files, max_workers=max_workers
182
+ "add_docs", user_id, vector_lake_description, start_from_batch,
183
+ intelligent_segmentation, session_id, files, max_workers=max_workers
176
184
  )
177
185
 
178
186
  def refresh_documents(self,
@@ -186,16 +194,11 @@ class VectorLakeClient:
186
194
  files_data: Optional[List[str]] = None,
187
195
  max_workers=5) -> Any:
188
196
  """
189
- Same semantics as add_documents:
190
- - Direct mode returns raw server response
191
- - Batch mode returns batch envelope
197
+ Same semantics as add_documents
192
198
  """
193
-
194
- # Direct mode
195
199
  if files_name and files_data:
196
200
  if len(files_name) != len(files_data):
197
201
  raise ValueError("files_name and files_data must be same length")
198
-
199
202
  payload = {
200
203
  "session_id": session_id,
201
204
  "user_id": user_id,
@@ -205,23 +208,130 @@ class VectorLakeClient:
205
208
  "intelligent_segmentation": intelligent_segmentation
206
209
  }
207
210
  endpoint = self.config.endpoints["refresh_docs"]
208
- result = self._make_request(endpoint, payload, "refresh_docs", batch_num=1)
209
- return result
211
+ return self._make_request(endpoint, payload, "refresh_docs", batch_num=1)
210
212
 
211
- # Batch mode — NOTE: call using positional 'operation' arg (operation first)
212
213
  return self._process_files_in_batches(
213
- "refresh_docs", user_id, vector_lake_description, start_from_batch, intelligent_segmentation, session_id, files, max_workers=max_workers
214
+ "refresh_docs", user_id, vector_lake_description, start_from_batch,
215
+ intelligent_segmentation, session_id, files, max_workers=max_workers
214
216
  )
215
217
 
216
- def health_check(self, user_id: str, vector_lake_description: str, session_id: Optional[str] = None) -> Dict[str, Any]:
218
+ def health_check(
219
+ self,
220
+ user_id: str,
221
+ vector_lake_description: str,
222
+ session_id: Optional[str] = None
223
+ ) -> Dict[str, Any]:
217
224
  endpoint = self.config.endpoints["health"]
218
- payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description}
225
+
226
+ payload = {
227
+ "user_id": user_id,
228
+ "vector_lake_description": vector_lake_description,
229
+ "session_id": session_id
230
+ }
231
+
219
232
  try:
220
- result = self._make_request(endpoint, payload, "health")
221
- return HealthResponse(status="success", message=result.get("message", "ok"), timestamp=time.time(), details=result)
233
+ result = self._make_request(
234
+ endpoint=endpoint,
235
+ payload=payload,
236
+ operation="health"
237
+ )
238
+
239
+ return result
240
+
222
241
  except Exception as e:
223
- return HealthResponse(status="error", message=str(e), timestamp=time.time())
242
+ return {
243
+ "status": "error",
244
+ "message": str(e),
245
+ "timestamp": time.time()
246
+ }
247
+
248
+ def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str,
249
+ start_from_batch: int = 1, intelligent_segmentation: bool = False,
250
+ session_id: Optional[str] = None, files: Optional[List[str]] = None,
251
+ max_workers: int = 5, batch_delay: float = 2) -> dict:
252
+ """
253
+ Processes files from the filesystem in batches using BatchManager.
254
+ Returns a standardized envelope.
255
+ """
256
+ base_path = self.config.vector_lake_path
257
+
258
+ # 1. Gather files if not provided
259
+ if files is None:
260
+ files = [f for f in os.listdir(base_path)
261
+ if os.path.isfile(os.path.join(base_path, f)) and self.file_processor.is_supported_file(f)]
224
262
 
263
+ # 2. Create batches using BatchManager
264
+ batches, chunks_dir = self.batch_manager.create_batches(files, base_path)
265
+ logging.info(f"Batches created: {batches}, chunks_dir: {chunks_dir}")
266
+
267
+ batch_outputs = []
268
+ start_index = start_from_batch - 1
269
+ endpoint = self.config.endpoints[operation]
270
+
271
+ for i, batch in enumerate(batches):
272
+ batch_num = i + 1
273
+
274
+ # Skip batches if resuming
275
+ if i < start_index:
276
+ logging.info(f"Skipping batch {batch_num}")
277
+ continue
278
+
279
+ start_time = time.time()
280
+
281
+ try:
282
+ # 3. Read file contents from chunks folder
283
+ file_contents = []
284
+ for fname in batch:
285
+ full_path = os.path.join(chunks_dir, fname)
286
+ if not os.path.exists(full_path):
287
+ full_path = os.path.join(base_path, fname)
288
+ content = self.file_processor.read_file_content(full_path)
289
+ file_contents.append(content)
290
+
291
+ payload = {
292
+ "session_id": session_id,
293
+ "user_id": user_id,
294
+ "vector_lake_description": vector_lake_description,
295
+ "files_name": batch, # API expects basenames
296
+ "files_data": file_contents,
297
+ "intelligent_segmentation": intelligent_segmentation
298
+ }
299
+ # logging.info(f"payload is {payload}")
300
+ # 4. Make request with backoff
301
+ result = self._make_request_with_backoff(endpoint, payload, operation, batch_num)
302
+
303
+ processing_time = time.time() - start_time
304
+ logging.info(f"Batch {batch_num} done")
305
+
306
+ batch_outputs.append({
307
+ "batch_number": batch_num,
308
+ "files": batch,
309
+ "success": True,
310
+ "processing_time": round(processing_time, 3),
311
+ "response": result
312
+ })
313
+
314
+ except Exception as e:
315
+ processing_time = time.time() - start_time
316
+ logging.error(f"Batch {batch_num} failed: {str(e)}")
317
+ batch_outputs.append({
318
+ "batch_number": batch_num,
319
+ "files": batch,
320
+ "success": False,
321
+ "processing_time": round(processing_time, 3),
322
+ "response": None,
323
+ "error": str(e)
324
+ })
325
+
326
+ # 5. Delay between batches
327
+ time.sleep(batch_delay)
328
+
329
+ return {
330
+ "mode": "batch",
331
+ "total_batches": len(batches),
332
+ "batches": sorted(batch_outputs, key=lambda x: x["batch_number"])
333
+ }
334
+
225
335
  def get_namespace_details(self, user_id: str, session_id: Optional[str] = None, vector_lake_description: Optional[str] = None) -> Dict[str, Any]:
226
336
  endpoint = self.config.endpoints["get_namespace_details_by_userid"]
227
337
  payload = {"session_id": session_id, "user_id": user_id}
@@ -252,84 +362,3 @@ class VectorLakeClient:
252
362
  return result
253
363
  except Exception as e:
254
364
  return {"status": "error", "message": str(e)}
255
-
256
- def _make_request_with_backoff(self, endpoint, payload, operation, batch_num, retries=5, base_delay=1):
257
- delay = base_delay
258
- for attempt in range(retries):
259
- try:
260
- result = self._make_request(endpoint, payload, operation, batch_num)
261
- return result
262
- except APIError as e:
263
- if getattr(e, "status_code", None) == 429:
264
- logging.warning(f"Batch {batch_num} throttled, retrying in {delay}s...")
265
- time.sleep(delay)
266
- delay *= 2
267
- continue
268
- raise
269
- except Exception:
270
- raise
271
-
272
- def _process_files_in_batches(self, operation: str, user_id: str, vector_lake_description: str, start_from_batch, intelligent_segmentation: bool = False, session_id: Optional[str] = None, files: Optional[List[str]] = None, max_workers: int = 1, batch_delay: float = 2) -> Dict[str, Any]:
273
- """
274
- Processes files from the filesystem in batches and returns a standardized
275
- envelope:
276
- {
277
- "mode": "batch",
278
- "total_batches": N,
279
- "batches": [
280
- { "batch_number": i, "files": [...], "success": True, "processing_time": x, "response": {...} },
281
- ...
282
- ]
283
- }
284
- """
285
- if files is None:
286
- files = [f for f in os.listdir(self.config.vector_lake_path) if os.path.isfile(os.path.join(self.config.vector_lake_path, f)) and self.file_processor.is_supported_file(f)]
287
-
288
- batches = self.batch_manager.create_batches(files, self.config.vector_lake_path)
289
- batch_outputs: List[Dict[str, Any]] = []
290
- start_batch_index = start_from_batch - 1
291
- if start_from_batch > 1:
292
- logging.info(f"Resuming from batch {start_from_batch}, skipping first {start_from_batch - 1} batches")
293
-
294
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
295
- futures = {}
296
- for i, batch in enumerate(batches):
297
- if i < start_batch_index:
298
- logging.info(f"Skipping batch {i + 1}")
299
- continue
300
- batch_num = i + 1
301
- file_contents = self._read_files(batch)
302
- payload = {"session_id": session_id, "user_id": user_id, "vector_lake_description": vector_lake_description, "files_name": batch, "files_data": file_contents, "intelligent_segmentation": intelligent_segmentation}
303
- endpoint = self.config.endpoints[operation]
304
- futures[executor.submit(self._make_request_with_backoff, endpoint, payload, operation, batch_num)] = (batch_num, batch, time.time())
305
- time.sleep(batch_delay)
306
-
307
- for future in as_completed(futures):
308
- batch_num, batch, start_time = futures[future]
309
- processing_time = time.time() - start_time
310
- try:
311
- result = future.result()
312
- logging.info(f"Batch {batch_num} done")
313
- batch_outputs.append({
314
- "batch_number": batch_num,
315
- "files": batch,
316
- "success": True,
317
- "processing_time": round(processing_time, 3),
318
- "response": result
319
- })
320
- except Exception as e:
321
- logging.error(f"Batch {batch_num} failed: {str(e)}")
322
- batch_outputs.append({
323
- "batch_number": batch_num,
324
- "files": batch,
325
- "success": False,
326
- "processing_time": round(processing_time, 3),
327
- "response": None,
328
- "error": str(e)
329
- })
330
-
331
- return {
332
- "mode": "batch",
333
- "total_batches": len(batch_outputs),
334
- "batches": sorted(batch_outputs, key=lambda x: x["batch_number"])
335
- }
@@ -8,10 +8,10 @@ class Config:
8
8
  self,
9
9
  api_key: Optional[str] = None,
10
10
  host: str = "https://waveflow-analytics.com",
11
- timeout: int = 60,
11
+ timeout: int = 240,
12
12
  max_retries: int = 2,
13
13
  max_files_per_batch: int = 100,
14
- max_batch_size_mb: int = 20,
14
+ max_batch_size_mb: int = 1,
15
15
  vector_lake_path: str = "upload",
16
16
  log_dir: str = "logs",
17
17
  service_port: int = None,
@@ -0,0 +1,551 @@
1
+ import os
2
+ import csv
3
+ import logging
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ import json
7
+ import shutil
8
+ from typing import List,Tuple
9
+
10
+ from .exceptions import FileProcessingError
11
+
12
+ # logger = logging.getLogger(__name__)
13
+ # logging.basicConfig(level=logging.INFO)
14
+
15
+ import logging
16
+
17
+ # Configure the logging settings
18
+ logging.basicConfig(
19
+ level=logging.DEBUG, # Set to INFO to see only high-level actions, or DEBUG for detail
20
+ format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
21
+ filename='batch_creation.log', # Log output to a file
22
+ filemode='w' # Overwrite the log file each run
23
+ )
24
+
25
+ # If you also want output to the console:
26
+ console_handler = logging.StreamHandler()
27
+ console_handler.setLevel(logging.INFO)
28
+ formatter = logging.Formatter('%(levelname)s: %(message)s')
29
+ console_handler.setFormatter(formatter)
30
+ logging.getLogger().addHandler(console_handler)
31
+
32
+ # Get the logger instance for your class (if needed, though basicConfig covers the root)
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # -------------------------------------------------------------------------
36
+ # UNIVERSAL HELPERS FOR FILE CONVERSION & CHUNKING
37
+ # -------------------------------------------------------------------------
38
+
39
+ def paragraph_chunk(text: str, max_chars: int) -> list:
40
+ """
41
+ Strict paragraph-based chunking.
42
+ No paragraph is ever split.
43
+ """
44
+ paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
45
+ chunks = []
46
+ current = ""
47
+
48
+ for p in paragraphs:
49
+ if len(current) + len(p) + 2 > max_chars:
50
+ if current:
51
+ chunks.append(current.strip())
52
+ current = p + "\n\n"
53
+ else:
54
+ current += p + "\n\n"
55
+
56
+ if current.strip():
57
+ chunks.append(current.strip())
58
+
59
+ return chunks
60
+
61
+
62
+ def json_to_csv(json_path: str) -> str:
63
+ out_path = json_path.replace(".json", ".csv")
64
+
65
+ with open(json_path, "r", encoding="utf-8") as f:
66
+ data = json.load(f)
67
+
68
+ if not isinstance(data, list):
69
+ raise FileProcessingError("JSON must be an array of objects.")
70
+
71
+ # Extract all keys across JSON objects
72
+ headers = sorted({k for row in data for k in row.keys()})
73
+
74
+ with open(out_path, "w", newline="", encoding="utf-8") as csvfile:
75
+ writer = csv.DictWriter(csvfile, fieldnames=headers)
76
+ writer.writeheader()
77
+ writer.writerows(data)
78
+
79
+ return out_path
80
+
81
+
82
+ def convert_to_txt(file_path: str) -> str:
83
+ base, _ = os.path.splitext(file_path)
84
+ out_path = base + ".txt"
85
+
86
+ if file_path.endswith(".py"):
87
+ with open(file_path, "r", encoding="utf-8") as f:
88
+ content = f.read()
89
+
90
+ elif file_path.endswith(".ipynb"):
91
+ import json
92
+ with open(file_path, "r", encoding="utf-8") as f:
93
+ nb = json.load(f)
94
+ content = "\n\n".join(
95
+ cell["source"] if isinstance(cell["source"], str)
96
+ else "".join(cell["source"])
97
+ for cell in nb.get("cells", []) if cell.get("cell_type") == "code"
98
+ )
99
+
100
+ else:
101
+ raise FileProcessingError("Unsupported file for text conversion")
102
+
103
+ with open(out_path, "w", encoding="utf-8") as out:
104
+ out.write(content)
105
+
106
+ return out_path
107
+
108
+
109
+ def extract_text(file_path: str) -> str:
110
+ """
111
+ Extract text from PDF, DOCX, or TXT.
112
+ """
113
+ ext = file_path.lower().split(".")[-1]
114
+
115
+ if ext == "pdf":
116
+ try:
117
+ import PyPDF2
118
+ except Exception as e:
119
+ raise FileProcessingError(f"PyPDF2 not available: {e}")
120
+ text = ""
121
+ with open(file_path, "rb") as f:
122
+ reader = PyPDF2.PdfReader(f)
123
+ for p in reader.pages:
124
+ text += (p.extract_text() or "") + "\n\n"
125
+ return text
126
+
127
+ if ext == "docx":
128
+ try:
129
+ from docx import Document
130
+ except Exception as e:
131
+ raise FileProcessingError(f"python-docx not available: {e}")
132
+ doc = Document(file_path)
133
+ return "\n\n".join(p.text for p in doc.paragraphs)
134
+
135
+ if ext == "txt":
136
+ with open(file_path, "r", encoding="utf-8") as f:
137
+ return f.read()
138
+
139
+ raise FileProcessingError("Unsupported extraction type.")
140
+
141
+
142
+ def split_csv_if_needed(file_path: str, max_bytes: int) -> list:
143
+ """
144
+ Split CSV into smaller parts using 20% safety buffer.
145
+ No partial rows.
146
+ """
147
+ SAFE_LIMIT = int(max_bytes * 0.8)
148
+ size = os.path.getsize(file_path)
149
+
150
+ if size <= SAFE_LIMIT:
151
+ return [file_path]
152
+
153
+ base, ext = os.path.splitext(file_path)
154
+ output_files = []
155
+
156
+ with open(file_path, "r", newline="", encoding="utf-8") as f:
157
+ reader = csv.reader(f)
158
+ header = next(reader)
159
+ header_bytes = len(",".join(header).encode("utf-8"))
160
+
161
+ part = 1
162
+ rows = []
163
+ current_size = header_bytes
164
+
165
+ for row in reader:
166
+ row_bytes = len(",".join(row).encode("utf-8"))
167
+
168
+ if row_bytes > SAFE_LIMIT:
169
+ raise FileProcessingError(f"A CSV row exceeds the safe max size.")
170
+
171
+ if current_size + row_bytes > SAFE_LIMIT:
172
+ out_file = f"{base}_part_{part}{ext}"
173
+ with open(out_file, "w", newline="", encoding="utf-8") as out:
174
+ writer = csv.writer(out)
175
+ writer.writerow(header)
176
+ writer.writerows(rows)
177
+ output_files.append(out_file)
178
+
179
+ part += 1
180
+ rows = []
181
+ current_size = header_bytes
182
+
183
+ rows.append(row)
184
+ current_size += row_bytes
185
+
186
+ # last chunk
187
+ if rows:
188
+ out_file = f"{base}_part_{part}{ext}"
189
+ with open(out_file, "w", newline="", encoding="utf-8") as out:
190
+ writer = csv.writer(out)
191
+ writer.writerow(header)
192
+ writer.writerows(rows)
193
+ output_files.append(out_file)
194
+
195
+ return output_files
196
+
197
+
198
+ # -------------------------------------------------------------------------
199
+ # FILE PROCESSOR
200
+ # -------------------------------------------------------------------------
201
+
202
+ class FileProcessor:
203
+ SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf', 'ipynb']
204
+
205
+ @staticmethod
206
+ def read_file_content(filepath: str) -> str:
207
+ ext = filepath.lower().split('.')[-1]
208
+ if ext in ['txt', 'csv', 'py', 'json']:
209
+ with open(filepath, encoding='utf-8') as f:
210
+ return f.read()
211
+ if ext == 'docx':
212
+ from docx import Document
213
+ doc = Document(filepath)
214
+ return '\n'.join(p.text for p in doc.paragraphs)
215
+ if ext == 'pdf':
216
+ import PyPDF2
217
+ with open(filepath, 'rb') as f:
218
+ reader = PyPDF2.PdfReader(f)
219
+ return '\n'.join(p.extract_text() or '' for p in reader.pages)
220
+ raise FileProcessingError(f'Unsupported extension: {ext}')
221
+
222
+ @staticmethod
223
+ def get_file_size(filepath: str) -> int:
224
+ return os.path.getsize(filepath)
225
+
226
+ @staticmethod
227
+ def is_supported_file(filename: str) -> bool:
228
+ ext = filename.lower().split('.')[-1]
229
+ return ext in FileProcessor.SUPPORTED_EXTENSIONS
230
+
231
+
232
+ # -------------------------------------------------------------------------
233
+ # LOGGER
234
+ # -------------------------------------------------------------------------
235
+
236
+ class Logger:
237
+ def __init__(self, log_dir: str):
238
+ Path(log_dir).mkdir(parents=True, exist_ok=True)
239
+ self.skipped_log = Path(log_dir) / 'skipped_files_log.csv'
240
+ self.api_error_log = Path(log_dir) / 'api_error_log.csv'
241
+ self.performance_log = Path(log_dir) / 'api_performance_log.csv'
242
+
243
+ def _write_csv_log(self, path: Path, header: List[str], row: List):
244
+ exists = path.exists()
245
+ with open(path, 'a', newline='', encoding='utf-8') as f:
246
+ writer = csv.writer(f)
247
+ if not exists:
248
+ writer.writerow(header)
249
+ writer.writerow(row)
250
+
251
+ def log_skipped_file(self, filename: str, reason: str):
252
+ self._write_csv_log(self.skipped_log,
253
+ ['ts', 'filename', 'reason'],
254
+ [datetime.utcnow().isoformat(), filename, reason])
255
+
256
+ def log_api_error(self, operation: str, batch_num: int, error_message: str):
257
+ self._write_csv_log(self.api_error_log,
258
+ ['ts', 'operation', 'batch_num', 'err'],
259
+ [datetime.utcnow().isoformat(), operation, batch_num, error_message])
260
+
261
+ def log_performance(self, **kwargs):
262
+ """
263
+ SAFE CSV logger for performance metrics.
264
+ Accepts ANY keyword args and serializes them safely.
265
+ """
266
+ logfile = self.performance_log
267
+
268
+ # Start row with timestamp
269
+ row = {"timestamp": datetime.utcnow().isoformat()}
270
+
271
+ # Safely encode all user-provided fields
272
+ for k, v in kwargs.items():
273
+ try:
274
+ if isinstance(v, (list, tuple, set)):
275
+ row[k] = ",".join(str(i) for i in v)
276
+ else:
277
+ row[k] = str(v) if v is not None else ""
278
+ except Exception:
279
+ row[k] = "UNSERIALIZABLE"
280
+
281
+ # Write row
282
+ file_exists = logfile.exists()
283
+ with open(logfile, "a", newline="", encoding="utf-8") as f:
284
+ writer = csv.DictWriter(f, fieldnames=row.keys())
285
+ if not file_exists:
286
+ writer.writeheader()
287
+ writer.writerow(row)
288
+
289
+
290
+ # -------------------------------------------------------------------------
291
+ # BATCH MANAGER (UPDATED)
292
+ # -------------------------------------------------------------------------
293
+ class BatchManager:
294
+ def __init__(self, max_files: int, max_size_mb: int):
295
+ self.max_files = max_files
296
+ self.max_bytes = max_size_mb * 1024 * 1024 # convert MB to bytes
297
+
298
+
299
def create_batches(self, files: List[str], base_path: str) -> Tuple[List[List[str]], str]:
    """
    Preprocess ``files`` into size-bounded pieces and group them into batches.

    Step 1 turns every input into one or more files under ``<base_path>/chunks``:

    * ``.json`` is converted with ``json_to_csv`` and then handled as CSV.
    * ``.py`` / ``.ipynb`` is converted with ``convert_to_txt`` and then handled as TXT.
    * ``.pdf`` / ``.docx`` / ``.txt`` text is obtained with ``extract_text`` and split
      with ``paragraph_chunk`` into ``<name>_part_<i>.txt`` files.
    * ``.csv`` is split with ``self._split_csv_safe``.
    * Anything else is copied unchanged when it fits within the safe limit,
      otherwise it is read as UTF-8 text (undecodable bytes ignored) and chunked.

    Step 2 packs the resulting pieces, in order, into batches of at most
    ``self.max_files`` entries and at most ``self.max_bytes`` bytes.

    Args:
        files: File names relative to ``base_path``. Names that do not resolve
            to an existing file are skipped with a warning.
        base_path: Directory containing the input files; the ``chunks``
            sub-directory is created inside it.

    Returns:
        ``(batches, chunks_dir)`` where each batch is a list of file *basenames*
        located in ``chunks_dir``.

    Raises:
        FileProcessingError: If an oversized generic file cannot be chunked, or a
            produced piece is still larger than ``self.max_bytes``.
    """
    logger.info(f"Starting batch creation. Max files: {self.max_files}, Max bytes: {self.max_bytes}")

    # Pieces are cut to 80% of the batch byte limit, leaving headroom below max_bytes.
    SAFE_LIMIT = int(self.max_bytes * 0.8)
    logger.debug(f"Calculated SAFE_LIMIT (80% of max_bytes): {SAFE_LIMIT} bytes")

    chunks_dir = os.path.join(base_path, "chunks")
    os.makedirs(chunks_dir, exist_ok=True)
    logger.info(f"Chunks directory created: {chunks_dir}")

    def write_text_parts(blocks, source_path):
        """Write each block to ``<stem>_part_<i>.txt`` in chunks_dir; return [(path, size)]."""
        stem, _ = os.path.splitext(os.path.basename(source_path))
        written = []
        for index, block in enumerate(blocks, 1):
            out_file = os.path.join(chunks_dir, f"{stem}_part_{index}.txt")
            with open(out_file, "w", encoding="utf-8") as out:
                out.write(block)
            written.append((out_file, os.path.getsize(out_file)))
        return written

    # (path inside chunks_dir, size in bytes) for every piece, in input order.
    file_info = []

    # -----------------------------
    # Step 1: Preprocess files
    # -----------------------------
    logger.info("--- Step 1: Preprocessing and Chunking Files ---")

    for f in files:
        filepath = os.path.join(base_path, f)

        if not os.path.isfile(filepath):
            logger.warning(f"File not found, skipping: {filepath}")
            continue

        ext = f.lower().split(".")[-1]
        logger.debug(f"Processing file: {f}, detected extension: {ext}")

        # JSON -> CSV
        if ext == "json":
            original_path = filepath
            filepath = json_to_csv(filepath)
            ext = "csv"
            logger.info(f"Converted JSON file {original_path} to CSV at {filepath}")

        # PY/IPYNB -> TXT
        elif ext in ("py", "ipynb"):
            original_path = filepath
            filepath = convert_to_txt(filepath)
            ext = "txt"
            logger.info(f"Converted code file {original_path} to TXT at {filepath}")

        # PDF/DOCX/TXT -> paragraph chunks
        if ext in ("pdf", "docx", "txt"):
            text = extract_text(filepath)
            chunks = paragraph_chunk(text, max_chars=SAFE_LIMIT)
            part_files = write_text_parts(chunks, filepath)
            for index, (out_file, size) in enumerate(part_files, 1):
                logger.debug(f"Chunked file {f} into part {index}: {os.path.basename(out_file)}, Size: {size} bytes")

            file_info.extend(part_files)
            logger.info(f"Successfully chunked {f} into {len(chunks)} parts.")
            continue

        # CSV -> row-aligned split
        if ext == "csv":
            parts = self._split_csv_safe(filepath, SAFE_LIMIT, chunks_dir)
            file_info.extend((p, os.path.getsize(p)) for p in parts)
            logger.info(f"Successfully split CSV file {f} into {len(parts)} safe chunks.")
            continue

        # Any other type: copy when small, otherwise chunk it as plain text.
        size = os.path.getsize(filepath)
        if size > SAFE_LIMIT:
            logger.warning(f"File {f} ({size} bytes) exceeds SAFE_LIMIT ({SAFE_LIMIT} bytes). Attempting generic TXT chunking.")
            try:
                with open(filepath, "r", encoding="utf-8", errors="ignore") as f_in:
                    content = f_in.read()

                chunks = paragraph_chunk(content, max_chars=SAFE_LIMIT)
                part_files = write_text_parts(chunks, filepath)
            except Exception as e:
                logger.error(f"Failed to chunk large file {filepath} as TXT. Error: {e}")
                # Chain the cause so the underlying failure stays visible in tracebacks.
                raise FileProcessingError(f"File {filepath} exceeds batch limit and cannot be chunked") from e

            for index, (out_file, _) in enumerate(part_files, 1):
                logger.debug(f"Generic chunked file {f} into part {index}: {os.path.basename(out_file)}")
            file_info.extend(part_files)

        else:
            new_path = os.path.join(chunks_dir, os.path.basename(filepath))
            # An existing file with the same name in chunks_dir is reused, not overwritten.
            # NOTE(review): the recorded size is the source file's size — confirm that a
            # stale copy from an earlier run cannot differ from the source.
            if not os.path.exists(new_path):
                shutil.copy2(filepath, new_path)
            file_info.append((new_path, size))
            logger.debug(f"Copied small file {f} to chunks_dir. Size: {size} bytes.")

    logger.info(f"Preprocessing complete. Total items ready for batching: {len(file_info)}")

    # -----------------------------
    # Step 2: Create dynamic batches
    # -----------------------------
    logger.info("--- Step 2: Creating Dynamic Batches ---")
    batches = []
    cur_batch = []
    cur_size = 0

    for filepath, size in file_info:
        filename = os.path.basename(filepath)
        logger.debug(f"Considering file: {filename}, Size: {size} bytes. Current batch size: {cur_size} bytes, files: {len(cur_batch)}")

        # Absolute safety check (should not trigger if Step 1 worked).
        if size > self.max_bytes:
            logger.critical(f"Chunked file {filename} ({size} bytes) still exceeds max batch size ({self.max_bytes} bytes).")
            raise FileProcessingError(f"File {filepath} exceeds max batch size ({self.max_bytes} bytes)")

        # Close the current batch when adding this piece would break either limit.
        if cur_batch and (len(cur_batch) >= self.max_files or (cur_size + size) > self.max_bytes):
            logger.info(f"Batch limit hit. Flushing batch with {len(cur_batch)} files and {cur_size} bytes.")
            batches.append(cur_batch)
            cur_batch = []
            cur_size = 0

        cur_batch.append(filename)
        cur_size += size
        logger.debug(f"Added file {filename}. New batch size: {cur_size} bytes, files: {len(cur_batch)}")

    if cur_batch:
        logger.info(f"Flushing final batch with {len(cur_batch)} files and {cur_size} bytes.")
        batches.append(cur_batch)

    logger.info(f"Batch creation complete. Total batches created: {len(batches)}")

    return batches, chunks_dir
438
+
439
def _split_csv_safe(self, file_path: str, safe_limit: int, chunks_dir: str, preferred_encoding: str = "utf-8") -> List[str]:
    """
    Split a CSV file into row-aligned chunk files no larger than ``safe_limit``.

    The encoding is chosen by strictly decoding the *whole* file with
    ``preferred_encoding``, then ``cp1252``, then ``latin-1``; the first one that
    decodes cleanly is used. Every chunk is written as UTF-8, repeats the header
    row, and is named ``<stem>_part_<n><suffix>`` inside ``chunks_dir``.

    Sizes are measured on the rows exactly as ``csv.writer`` serializes them
    (including quoting and the line terminator), so the bytes written to disk
    match the accounting used to decide where to cut.

    Args:
        file_path: Path of the CSV file to split.
        safe_limit: Target maximum size, in bytes, of each chunk file.
        chunks_dir: Directory that receives the chunk files (created if missing).
        preferred_encoding: First encoding to try when reading the input.

    Returns:
        Paths of the chunk files, in order. Empty when the file has a header
        but no data rows.

    Raises:
        FileProcessingError: If the file is empty, cannot be opened/read/decoded,
            or a single serialized row is larger than ``safe_limit``.
    """
    import io  # local import: only needed to measure serialized row sizes

    input_file = Path(file_path)
    output_dir = Path(chunks_dir)
    output_files = []

    # 1. Ensure the output directory exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    base_fname = input_file.stem
    ext = input_file.suffix

    # 2. Pick the first encoding that decodes the entire file without errors.
    #    Checking only the header would accept UTF-8 for files whose invalid
    #    bytes appear further down.
    encodings_to_try = [preferred_encoding, "cp1252", "latin-1"]
    effective_encoding = None

    for encoding in encodings_to_try:
        try:
            with open(input_file, "r", newline="", encoding=encoding) as probe:
                while probe.read(1 << 16):  # stream, so large files are not held in memory
                    pass
                probe.seek(0)
                next(csv.reader(probe))  # StopIteration here means there is no header row
        except UnicodeDecodeError:
            logger.warning(f"Failed to decode with {encoding}. Trying next encoding...")
            continue
        except StopIteration:
            raise FileProcessingError(f"CSV file {file_path} is empty.") from None
        except Exception as e:
            raise FileProcessingError(f"Error opening or reading file {file_path}: {e}") from e

        effective_encoding = encoding
        break

    if not effective_encoding:
        raise FileProcessingError(f"Could not decode CSV file {file_path} using any of the tested encodings: {', '.join(encodings_to_try)}")

    # Scratch buffer used to measure a row exactly as csv.writer will emit it.
    size_buffer = io.StringIO(newline="")
    size_writer = csv.writer(size_buffer)

    def serialized_size(row):
        """Return the UTF-8 byte length of ``row`` as written by csv.writer."""
        size_buffer.seek(0)
        size_buffer.truncate(0)
        size_writer.writerow(row)
        return len(size_buffer.getvalue().encode("utf-8"))

    def write_chunk(part_number, header_row, data_rows):
        """Write header + rows to the numbered chunk file and record its path."""
        out_path = output_dir / f"{base_fname}_part_{part_number}{ext}"
        with open(out_path, "w", newline="", encoding="utf-8") as out:
            writer = csv.writer(out)
            writer.writerow(header_row)
            writer.writerows(data_rows)
        output_files.append(str(out_path))

    # 3. Main splitting pass. errors='replace' is kept as a safety net in case the
    #    file content changes between the validation pass and this one.
    with open(input_file, "r", newline="", encoding=effective_encoding, errors="replace") as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            # Should not happen after step 2; kept as a defensive guard.
            return output_files

        header_bytes = serialized_size(header)

        part = 1
        rows = []
        current_size = header_bytes

        for row in reader:
            try:
                row_bytes = serialized_size(row)
            except (UnicodeEncodeError, csv.Error) as e:
                logger.warning(f"Warning: Skipping problematic row during size calculation: {e}")
                continue

            if row_bytes > safe_limit:
                raise FileProcessingError(f"A CSV row exceeds the safe max size ({safe_limit} bytes) at part {part}")

            # Flush only when there is something to flush; otherwise a large
            # first row would produce a header-only chunk file.
            if rows and current_size + row_bytes > safe_limit:
                write_chunk(part, header, rows)
                part += 1
                rows = []
                current_size = header_bytes

            rows.append(row)
            current_size += row_bytes

        # Write the final, partially filled chunk.
        if rows:
            write_chunk(part, header, rows)

    return output_files
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: waveflowdb_client
3
- Version: 0.0.4
3
+ Version: 1.0.0
4
4
  Summary: VectorLake SDK — Deterministic backend engine powering agent workflows
5
5
  Author-email: "agentanalytics.ai" <nitin@agentanalytics.ai>
6
6
  License: MIT License
@@ -27,7 +27,7 @@ License: MIT License
27
27
 
28
28
  Project-URL: Homepage, https://agentanalytics.ai
29
29
  Project-URL: Documentation, https://www.agentanalytics.ai/docs/waveflow-db
30
- Keywords: vector db,VECTOR QUERY LANGUAGE,waveflow,agentanalytics,VQL
30
+ Keywords: vector db,VECTOR QUERY LANGUAGE,waveflowdb,agentanalytics,VQL
31
31
  Requires-Python: >=3.8
32
32
  Description-Content-Type: text/markdown
33
33
  License-File: LICENSE
@@ -1,149 +0,0 @@
1
- import os
2
- import csv
3
- import logging
4
- from pathlib import Path
5
- from datetime import datetime
6
- import json
7
- import traceback
8
- import time
9
- from typing import List
10
- # Heavy libs are imported at runtime when needed in real environments.
11
-
12
- from .exceptions import FileProcessingError
13
-
14
- logger = logging.getLogger(__name__)
15
- logging.basicConfig(level=logging.INFO)
16
-
17
class FileProcessor:
    """Static helpers for reading supported files and inspecting them."""

    # Lower-case extensions (without the dot) accepted by is_supported_file.
    SUPPORTED_EXTENSIONS = ['txt', 'csv', 'json', 'py', 'docx', 'pdf']

    @staticmethod
    def _extension(path: str) -> str:
        """Return the lower-cased extension of ``path`` without the dot.

        Only the final path component is inspected, and a name that contains
        no dot has no extension (``''``). This keeps a bare name such as
        ``"csv"`` from being mistaken for a CSV file.
        """
        name = os.path.basename(path).lower()
        _, dot, ext = name.rpartition('.')
        return ext if dot else ''

    @staticmethod
    def read_file_content(filepath: str) -> str:
        """Return the text content of ``filepath``.

        ``txt``/``csv``/``py``/``json`` files are read as UTF-8 text. ``docx``
        and ``pdf`` are parsed with ``python-docx`` and ``PyPDF2``, imported
        lazily so that the module can be imported without them.

        Raises:
            FileProcessingError: If the extension is unsupported or the
                required parsing library cannot be imported.
        """
        ext = FileProcessor._extension(filepath)
        if ext in ['txt', 'csv', 'py', 'json']:
            with open(filepath, encoding='utf-8') as f:
                return f.read()
        # Defer complex parsing to runtime imports to avoid import-time failures.
        if ext == 'docx':
            try:
                from docx import Document
            except Exception as e:
                raise FileProcessingError(f"python-docx not available: {e}") from e
            doc = Document(filepath)
            return '\n'.join(p.text for p in doc.paragraphs)
        if ext == 'pdf':
            try:
                import PyPDF2
            except Exception as e:
                raise FileProcessingError(f"PyPDF2 not available: {e}") from e
            with open(filepath, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for a page; treat that as empty text.
                return '\n'.join(p.extract_text() or '' for p in reader.pages)
        raise FileProcessingError(f'Unsupported extension: {ext}')

    @staticmethod
    def get_file_size(filepath: str) -> int:
        """Return the size of ``filepath`` in bytes."""
        return os.path.getsize(filepath)

    @staticmethod
    def is_supported_file(filename: str) -> bool:
        """Return True when ``filename`` has one of the SUPPORTED_EXTENSIONS."""
        return FileProcessor._extension(filename) in FileProcessor.SUPPORTED_EXTENSIONS
52
-
53
class Logger:
    """Append-only CSV logs for skipped files, API errors and performance records."""

    def __init__(self, log_dir: str):
        """Create ``log_dir`` if needed and set the three log file paths inside it."""
        log_root = Path(log_dir)
        log_root.mkdir(parents=True, exist_ok=True)
        self.skipped_log = log_root / 'skipped_files_log.csv'
        self.api_error_log = log_root / 'api_error_log.csv'
        self.performance_log = log_root / 'api_performance_log.csv'

    def _write_csv_log(self, path: Path, header: List[str], row: List):
        """Append ``row`` to ``path``, writing ``header`` first when the file is new."""
        exists = path.exists()
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not exists:
                writer.writerow(header)
            writer.writerow(row)

    def log_skipped_file(self, filename: str, reason: str):
        """Record a skipped file and the reason, stamped with the current UTC time."""
        self._write_csv_log(self.skipped_log, ['ts', 'filename', 'reason'], [datetime.utcnow().isoformat(), filename, reason])

    def log_api_error(self, operation: str, batch_num: int, error_message: str):
        """Record an API error for ``operation``/``batch_num``, stamped with the current UTC time."""
        self._write_csv_log(self.api_error_log, ['ts', 'operation', 'batch_num', 'err'], [datetime.utcnow().isoformat(), operation, batch_num, error_message])

    def log_performance(self, operation=None, batch_num=None, latency=None,
                        request_size=None, response_size=None, result_count=None,
                        files_processed=None, error: Exception = None):
        """
        Append one performance record to the performance CSV log.

        The record goes to ``self.perf_csv`` when that attribute is set, and to
        ``self.performance_log`` (inside the log directory) otherwise. Every
        value is converted to a string first, so any input type is accepted;
        lists, tuples and sets are joined with commas and ``None`` becomes ``""``.

        Logging is best-effort: a failure to write is printed with its
        traceback and never raised to the caller.
        """
        # Honour an explicit perf_csv override, otherwise use the path set in
        # __init__; the bare file name is only a last resort for instances
        # that were created without running __init__.
        logfile = getattr(self, "perf_csv", None) or getattr(self, "performance_log", "performance_logs.csv")

        def safe_str(value):
            """Stringify ``value`` without ever raising."""
            try:
                if isinstance(value, (list, tuple, set)):
                    return ",".join(str(v) for v in value)
                return str(value) if value is not None else ""
            except Exception:
                return "UNSERIALIZABLE"

        row = {
            "timestamp": datetime.utcnow().isoformat(),
            "operation": safe_str(operation),
            "batch_num": safe_str(batch_num),
            "latency_ms": safe_str(latency),
            "request_size": safe_str(request_size),
            "response_size": safe_str(response_size),
            "result_count": safe_str(result_count),
            "files_processed": safe_str(files_processed),
            "error": safe_str(error),
        }

        try:
            file_exists = os.path.isfile(logfile)

            with open(logfile, mode="a", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=list(row))

                if not file_exists:
                    writer.writeheader()

                writer.writerow(row)

        except Exception:
            # Deliberately swallowed: performance logging must not break the SDK call.
            print("[CSV LOGGING ERROR] Could not write performance log:")
            print(traceback.format_exc())
119
-
120
class BatchManager:
    """Groups files into batches bounded by a file count and a total byte size."""

    def __init__(self, max_files: int, max_size_mb: int):
        """Store the per-batch limits; ``max_size_mb`` is converted to bytes."""
        self.max_files = max_files
        self.max_bytes = max_size_mb * 1024 * 1024

    def create_batches(self, files: List[str], base_path: str) -> List[List[str]]:
        """
        Pack ``files`` (names relative to ``base_path``) into ordered batches.

        A file whose size cannot be read, or whose size alone exceeds
        ``self.max_bytes``, is left out of every batch; each such skip is
        logged as a warning. The remaining files keep their input order, and
        a new batch is started whenever adding the next file would exceed
        ``self.max_files`` entries or ``self.max_bytes`` bytes.

        Returns:
            A list of batches, each a list of the original file names.
        """
        log = logging.getLogger(__name__)

        file_info = []
        for f in files:
            p = os.path.join(base_path, f)
            try:
                size = os.path.getsize(p)
            except (OSError, ValueError) as e:
                # Missing/unreadable path, or a name the OS rejects outright.
                log.warning("Skipping %s: cannot determine size (%s)", p, e)
                continue
            if size > self.max_bytes:
                log.warning("Skipping %s: %s bytes exceeds batch limit of %s bytes", p, size, self.max_bytes)
                continue
            file_info.append((f, size))

        batches = []
        cur = []
        cur_size = 0
        for fname, size in file_info:
            # Close the current batch before a file that would break either limit.
            if cur and (len(cur) >= self.max_files or (cur_size + size) > self.max_bytes):
                batches.append(cur)
                cur = []
                cur_size = 0
            cur.append(fname)
            cur_size += size
        if cur:
            batches.append(cur)
        return batches