signalpilot-ai-internal 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.

This version of signalpilot-ai-internal has been flagged as a potentially problematic release.

Files changed (46)
  1. signalpilot_ai_internal/_version.py +1 -1
  2. signalpilot_ai_internal/cache_service.py +152 -1
  3. signalpilot_ai_internal/file_scanner_service.py +1252 -0
  4. signalpilot_ai_internal/handlers.py +262 -0
  5. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json +2 -2
  6. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig +1 -1
  7. signalpilot_ai_internal-0.6.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/188.1ace26ac1a5e246783bb.js +1 -0
  8. signalpilot_ai_internal-0.6.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/839.7d9a99d0566aa6743c69.js +1 -0
  9. signalpilot_ai_internal-0.6.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.4e9edb7f224152c1dcb4.js +2 -0
  10. signalpilot_ai_internal-0.6.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.410a42566793b732952f.js +1 -0
  11. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/third-party-licenses.json +0 -6
  12. {signalpilot_ai_internal-0.5.0.dist-info → signalpilot_ai_internal-0.6.0.dist-info}/METADATA +3 -1
  13. signalpilot_ai_internal-0.6.0.dist-info/RECORD +46 -0
  14. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/104.04e170724f369fcbaf19.js +0 -2
  15. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/104.04e170724f369fcbaf19.js.LICENSE.txt +0 -24
  16. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/188.e781cc4c87f2dbf290ec.js +0 -1
  17. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/606.90aaaae46b73dc3c08fb.js +0 -1
  18. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/839.aebe246bc24f6809f864.js +0 -1
  19. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.5251a593584dd5d131d5.js +0 -2
  20. signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.bd213fdf0227c652e1a8.js +0 -1
  21. signalpilot_ai_internal-0.5.0.dist-info/RECORD +0 -48
  22. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/etc/jupyter/jupyter_server_config.d/signalpilot_ai.json +0 -0
  23. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/install.json +0 -0
  24. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/plugin.json +0 -0
  25. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.e2dadf63dc64d7b5f1ee.js +0 -0
  26. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.328403b5545f268b95c6.js +0 -0
  27. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.726e1da31a50868cb297.js +0 -0
  28. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.72484b768a04f89bd3dd.js +0 -0
  29. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.dbec4c2dc12e7b050dcc.js +0 -0
  30. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.fa432bdb7fb6b1c95ad6.js +0 -0
  31. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.37e271d7a80336daabe2.js +0 -0
  32. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.9b4f05a99f5003f82094.js +0 -0
  33. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.73c7a9290b7d35a8b9c1.js +0 -0
  34. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.b58fc0093d080b8ee61c.js +0 -0
  35. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js +0 -0
  36. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js.LICENSE.txt +0 -0
  37. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.9720593ee20b768da3ca.js +0 -0
  38. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.8e6edc9a965bdd578ca7.js +0 -0
  39. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.dc49867fafb03ea2ba4d.js +0 -0
  40. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/742.91e7b516c8699eea3373.js +0 -0
  41. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.3aa564fc148b37d1d719.js +0 -0
  42. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/888.34054db17bcf6e87ec95.js +0 -0
  43. /signalpilot_ai_internal-0.5.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.5251a593584dd5d131d5.js.LICENSE.txt → /signalpilot_ai_internal-0.6.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.4e9edb7f224152c1dcb4.js.LICENSE.txt +0 -0
  44. {signalpilot_ai_internal-0.5.0.data → signalpilot_ai_internal-0.6.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/style.js +0 -0
  45. {signalpilot_ai_internal-0.5.0.dist-info → signalpilot_ai_internal-0.6.0.dist-info}/WHEEL +0 -0
  46. {signalpilot_ai_internal-0.5.0.dist-info → signalpilot_ai_internal-0.6.0.dist-info}/licenses/LICENSE +0 -0
signalpilot_ai_internal/file_scanner_service.py (new file)
@@ -0,0 +1,1252 @@
1
+ """
2
+ File Scanner Service for SignalPilot AI.
3
+ Handles file scanning, schema extraction, and directory tracking.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import hashlib
9
+ import threading
10
+ import asyncio
11
+ from datetime import datetime, timedelta
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Any, Tuple
14
+ import pandas as pd
15
+ import numpy as np
16
+ import concurrent.futures
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ import pyarrow.dataset as ds
19
+ from openpyxl import load_workbook
20
+
21
+ from .cache_service import get_cache_service, get_file_scan_cache_manager
22
+
23
+
24
+ class FileScannerService:
25
+ """Service for scanning directories and extracting file schemas"""
26
+
27
+ def __init__(self):
28
+ self.cache_service = get_cache_service()
29
+ self.file_scan_cache = get_file_scan_cache_manager()
30
+ self._lock = threading.RLock()
31
+
32
+ # Data file extensions
33
+ self.DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet', '.pkl', '.pickle',
34
+ '.feather', '.hdf5', '.h5', '.sql', '.db', '.sqlite', '.tsv', '.txt'}
35
+
36
+ # Directories to exclude from search
37
+ self.EXCLUDE_DIRS = {'.git', '.ipynb_checkpoints', 'node_modules', '__pycache__',
38
+ '.venv', 'venv', 'env', '.pytest_cache', '.mypy_cache',
39
+ 'dist', 'build', '.tox', 'logs', '.vscode'}
40
+
41
+ # Thread pool for async schema extraction
42
+ self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="schema_extractor")
43
+
44
+ # Dedicated thread pool for directory scanning operations (I/O blocking)
45
+ self._dir_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="dir_scanner")
46
+
47
+ async def _list_directory_async(self, directory_path: Path) -> List[Path]:
48
+ """List directory contents using dedicated thread pool to avoid blocking."""
49
+ loop = asyncio.get_running_loop()
50
+ return await loop.run_in_executor(self._dir_executor, lambda: list(directory_path.iterdir()))
51
+
52
+ def __del__(self):
53
+ """Cleanup thread pools when service is destroyed."""
54
+ if hasattr(self, '_dir_executor'):
55
+ self._dir_executor.shutdown(wait=False)
56
+ if hasattr(self, '_executor'):
57
+ self._executor.shutdown(wait=False)
58
+
59
+ def _get_directory_hash(self, directory: str) -> str:
60
+ """Generate a hash for a directory path"""
61
+ return hashlib.md5(directory.encode()).hexdigest()[:16]
62
+
63
+
64
+ def _is_binary_file(self, filepath: str, chunk_size: int = 512) -> bool:
65
+ """Ultra-fast binary file detection with minimal I/O"""
66
+ try:
67
+ with open(filepath, 'rb') as f:
68
+ chunk = f.read(chunk_size)
69
+ if not chunk:
70
+ return False
71
+ # Fast null byte check - if any null bytes, it's binary
72
+ if b'\x00' in chunk:
73
+ return True
74
+ # Quick printable ratio check using bytes directly
75
+ printable = sum(1 for b in chunk if 32 <= b <= 126 or b in (9, 10, 13))
76
+ return (printable / len(chunk)) < 0.7
77
+ except (IOError, OSError):
78
+ return True
79
+
80
+ def _generate_pickle_data_preview(self, data: Any, max_items: int = 3, max_chars: int = 1000) -> Tuple[str, bool]:
81
+ """
82
+ Generate a content preview for non-DataFrame pickle data.
83
+ Returns (preview_content, is_truncated)
84
+ """
85
+ try:
86
+ data_type = type(data).__name__
87
+
88
+ if isinstance(data, (list, tuple)):
89
+ if len(data) == 0:
90
+ return f"Empty {data_type}", False
91
+
92
+ preview_items = []
93
+ for i, item in enumerate(data[:max_items]):
94
+ item_str = str(item)
95
+ if len(item_str) > 200:
96
+ item_str = item_str[:200] + "..."
97
+ preview_items.append(f"[{i}]: {item_str}")
98
+
99
+ preview = f"{data_type} with {len(data)} items:\n" + "\n".join(preview_items)
100
+ is_truncated = len(data) > max_items
101
+
102
+ if len(preview) > max_chars:
103
+ preview = preview[:max_chars] + "..."
104
+ is_truncated = True
105
+
106
+ return preview, is_truncated
107
+
108
+ elif isinstance(data, dict):
109
+ if len(data) == 0:
110
+ return f"Empty {data_type}", False
111
+
112
+ preview_items = []
113
+ for i, (key, value) in enumerate(list(data.items())[:max_items]):
114
+ key_str = str(key)
115
+ value_str = str(value)
116
+ if len(value_str) > 150:
117
+ value_str = value_str[:150] + "..."
118
+ preview_items.append(f"'{key_str}': {value_str}")
119
+
120
+ preview = f"{data_type} with {len(data)} keys:\n" + "\n".join(preview_items)
121
+ is_truncated = len(data) > max_items
122
+
123
+ if len(preview) > max_chars:
124
+ preview = preview[:max_chars] + "..."
125
+ is_truncated = True
126
+
127
+ return preview, is_truncated
128
+
129
+ elif isinstance(data, np.ndarray):
130
+ shape_str = str(data.shape)
131
+ dtype_str = str(data.dtype)
132
+
133
+ if data.size == 0:
134
+ return f"Empty numpy array: shape={shape_str}, dtype={dtype_str}", False
135
+
136
+ # Show first few elements
137
+ flat_data = data.flatten()[:max_items]
138
+ elements_str = ", ".join([str(x) for x in flat_data])
139
+
140
+ preview = f"numpy.ndarray: shape={shape_str}, dtype={dtype_str}\nFirst elements: [{elements_str}]"
141
+ is_truncated = data.size > max_items
142
+
143
+ if len(preview) > max_chars:
144
+ preview = preview[:max_chars] + "..."
145
+ is_truncated = True
146
+
147
+ return preview, is_truncated
148
+
149
+ elif isinstance(data, str):
150
+ if len(data) == 0:
151
+ return "Empty string", False
152
+
153
+ preview = f"String ({len(data)} chars): {data[:max_chars]}"
154
+ is_truncated = len(data) > max_chars
155
+ return preview, is_truncated
156
+
157
+ elif isinstance(data, (int, float, bool)):
158
+ return f"{data_type}: {data}", False
159
+
160
+ else:
161
+ # For other types, try to convert to string
162
+ data_str = str(data)
163
+ if len(data_str) > max_chars:
164
+ data_str = data_str[:max_chars] + "..."
165
+ is_truncated = True
166
+ else:
167
+ is_truncated = False
168
+
169
+ return f"{data_type}: {data_str}", is_truncated
170
+
171
+ except Exception as e:
172
+ return f"Error generating preview for {type(data).__name__}: {str(e)}", False
173
+
174
+ def _parse_json_array_simple(self, filepath: str, max_items: int = 5) -> Tuple[List[Any], bool]:
175
+ """
176
+ Simple JSON array parsing that reads first chunk and extracts items.
177
+ More robust for very large files.
178
+ Returns (items_list, is_truncated)
179
+ """
180
+ import json
181
+
182
+ try:
183
+ # Read first 50KB of the file
184
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
185
+ chunk = f.read(50000) # 50KB chunk
186
+
187
+ # Try to find the opening bracket
188
+ bracket_pos = chunk.find('[')
189
+ if bracket_pos == -1:
190
+ # Not an array, try as single object
191
+ f.seek(0)
192
+ try:
193
+ single_obj = json.load(f)
194
+ return [single_obj], False
195
+ except:
196
+ return [], False
197
+
198
+ # Find the first few complete JSON objects in the chunk
199
+ items = []
200
+ current_pos = bracket_pos + 1
201
+
202
+ while len(items) < max_items and current_pos < len(chunk):
203
+ # Find the next complete JSON object
204
+ brace_count = 0
205
+ start_pos = current_pos
206
+ in_string = False
207
+ escape_next = False
208
+
209
+ for i in range(current_pos, len(chunk)):
210
+ char = chunk[i]
211
+
212
+ if escape_next:
213
+ escape_next = False
214
+ continue
215
+ elif char == '\\':
216
+ escape_next = True
217
+ continue
218
+ elif char == '"' and not escape_next:
219
+ in_string = not in_string
220
+ elif not in_string:
221
+ if char == '{':
222
+ brace_count += 1
223
+ elif char == '}':
224
+ brace_count -= 1
225
+ if brace_count == 0:
226
+ # Found complete object
227
+ try:
228
+ obj_str = chunk[start_pos:i+1].strip()
229
+ if obj_str.startswith(','):
230
+ obj_str = obj_str[1:].strip()
231
+ if obj_str:
232
+ obj = json.loads(obj_str)
233
+ items.append(obj)
234
+ except:
235
+ pass
236
+ current_pos = i + 1
237
+ break
238
+ elif char == ',' and brace_count == 0 and not in_string:
239
+ # End of current item
240
+ try:
241
+ obj_str = chunk[start_pos:i].strip()
242
+ if obj_str.startswith(','):
243
+ obj_str = obj_str[1:].strip()
244
+ if obj_str:
245
+ obj = json.loads(obj_str)
246
+ items.append(obj)
247
+ except:
248
+ pass
249
+ current_pos = i + 1
250
+ break
251
+ else:
252
+ break
253
+
254
+ # Check if there's more content
255
+ remaining = f.read(1000)
256
+ is_truncated = len(remaining) > 0 or len(items) == max_items
257
+
258
+ return items, is_truncated
259
+
260
+ except Exception:
261
+ return [], False
262
+
263
+ def _read_json_file(self, filepath: str, max_items: int = 5) -> Tuple[List[Any], str, bool]:
264
+ """
265
+ Read JSON file with smart loading strategy.
266
+ Returns (data_list, file_format, is_truncated)
267
+ file_format: 'object', 'array', or 'jsonl'
268
+ """
269
+ import json
270
+
271
+ try:
272
+ # Check file size
273
+ file_size = os.path.getsize(filepath)
274
+ size_mb = file_size / (1024 * 1024)
275
+
276
+ # For small files (< 10MB), use simple json.load()
277
+ if size_mb < 10:
278
+ try:
279
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
280
+ data = json.load(f)
281
+
282
+ if isinstance(data, list):
283
+ return data[:max_items], 'array', len(data) > max_items
284
+ elif isinstance(data, dict):
285
+ return [data], 'object', False
286
+ else:
287
+ return [data], 'primitive', False
288
+ except json.JSONDecodeError:
289
+ # Try as JSONL if JSON parsing fails
290
+ pass
291
+
292
+ # For large files or if JSON parsing failed, try incremental parsing
293
+ try:
294
+ # First check if it's JSONL (line-delimited JSON)
295
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
296
+ first_line = f.readline().strip()
297
+ if first_line and first_line.startswith('{') and first_line.endswith('}'):
298
+ # Likely JSONL format
299
+ f.seek(0)
300
+ items = []
301
+ for i, line in enumerate(f):
302
+ if i >= max_items:
303
+ break
304
+ line = line.strip()
305
+ if line:
306
+ try:
307
+ item = json.loads(line)
308
+ items.append(item)
309
+ except:
310
+ continue
311
+ return items, 'jsonl', True # Assume truncated for large files
312
+
313
+ # Try simple array parsing
314
+ items, is_truncated = self._parse_json_array_simple(filepath, max_items)
315
+ if items:
316
+ return items, 'array', is_truncated
317
+
318
+ # Fallback: try to read first few lines as individual JSON objects
319
+ try:
320
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
321
+ items = []
322
+ for i, line in enumerate(f):
323
+ if i >= max_items:
324
+ break
325
+ line = line.strip()
326
+ if line and (line.startswith('{') or line.startswith('[')):
327
+ try:
328
+ obj = json.loads(line)
329
+ items.append(obj)
330
+ except:
331
+ continue
332
+ if items:
333
+ return items, 'jsonl', True
334
+ except:
335
+ pass
336
+
337
+ # Final fallback: try to read as single object
338
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
339
+ content = f.read(10000) # Read first 10KB
340
+ try:
341
+ data = json.loads(content)
342
+ if isinstance(data, dict):
343
+ return [data], 'object', True
344
+ except:
345
+ pass
346
+
347
+ return [], 'unknown', False
348
+
349
+ except Exception:
350
+ return [], 'unknown', False
351
+
352
+ except Exception:
353
+ return [], 'unknown', False
354
+
355
+ def _infer_json_type(self, value: Any) -> str:
356
+ """Infer data type from JSON value"""
357
+ if value is None:
358
+ return 'null'
359
+ elif isinstance(value, bool):
360
+ return 'boolean'
361
+ elif isinstance(value, int):
362
+ return 'integer'
363
+ elif isinstance(value, float):
364
+ return 'float'
365
+ elif isinstance(value, str):
366
+ # Try to detect if it's a numeric string
367
+ try:
368
+ float(value)
369
+ return 'numeric_string'
370
+ except:
371
+ return 'string'
372
+ elif isinstance(value, list):
373
+ return 'array'
374
+ elif isinstance(value, dict):
375
+ return 'object'
376
+ else:
377
+ return 'unknown'
378
+
379
+ def _extract_keys_from_object(self, obj: dict, prefix: str = "", max_depth: int = 3, max_fields: int = 50) -> Dict[str, Dict[str, Any]]:
380
+ """Recursively extract keys from nested objects with truncation limits"""
381
+ keys = {}
382
+
383
+ if max_depth <= 0 or len(keys) >= max_fields:
384
+ return keys
385
+
386
+ for i, (key, value) in enumerate(obj.items()):
387
+ if len(keys) >= max_fields:
388
+ break
389
+
390
+ full_key = f"{prefix}.{key}" if prefix else key
391
+ value_type = self._infer_json_type(value)
392
+
393
+ # Truncate sample values to 50 characters max
394
+ sample_value = None
395
+ if not isinstance(value, (dict, list)):
396
+ sample_str = str(value)
397
+ sample_value = sample_str[:50] + "..." if len(sample_str) > 50 else sample_str
398
+
399
+ keys[full_key] = {
400
+ 'type': value_type,
401
+ 'sample_value': sample_value,
402
+ 'is_nested': isinstance(value, (dict, list)),
403
+ 'depth': len(prefix.split('.')) if prefix else 0
404
+ }
405
+
406
+ # Recursively extract nested keys (reduced depth)
407
+ if isinstance(value, dict) and max_depth > 1 and len(keys) < max_fields:
408
+ nested_keys = self._extract_keys_from_object(value, full_key, max_depth - 1, max_fields - len(keys))
409
+ keys.update(nested_keys)
410
+ elif isinstance(value, list) and value and isinstance(value[0], dict) and len(keys) < max_fields:
411
+ # For arrays of objects, analyze the first object only
412
+ nested_keys = self._extract_keys_from_object(value[0], f"{full_key}[0]", max_depth - 1, max_fields - len(keys))
413
+ keys.update(nested_keys)
414
+
415
+ return keys
416
+
417
+ def _analyze_json_structure(self, data_list: List[Any], filepath: str, filename: str, current_mtime: float) -> Dict[str, Any]:
418
+ """
419
+ Analyze JSON structure and extract comprehensive schema information.
420
+ """
421
+ if not data_list:
422
+ return {
423
+ 'success': False,
424
+ 'error': f'No data found in JSON file. File may be too large ({os.path.getsize(filepath) / (1024*1024):.1f}MB) or malformed. Try using a smaller sample or check file format.'
425
+ }
426
+
427
+ # Analyze structure
428
+ all_keys = {}
429
+ sample_data = []
430
+ total_records = len(data_list)
431
+ max_depth = 0
432
+ structure_types = set()
433
+
434
+ # Process each record
435
+ for i, record in enumerate(data_list):
436
+ if isinstance(record, dict):
437
+ structure_types.add('object')
438
+ # Extract keys from this record
439
+ record_keys = self._extract_keys_from_object(record)
440
+
441
+ # Merge with all_keys, keeping track of types and sample values
442
+ for key, key_info in record_keys.items():
443
+ if key not in all_keys:
444
+ all_keys[key] = {
445
+ 'type': key_info['type'],
446
+ 'sample_values': [],
447
+ 'is_consistent': True,
448
+ 'depth': key_info['depth']
449
+ }
450
+
451
+ # Add sample value
452
+ if key_info['sample_value']:
453
+ all_keys[key]['sample_values'].append(key_info['sample_value'])
454
+
455
+ # Check type consistency
456
+ if all_keys[key]['type'] != key_info['type']:
457
+ all_keys[key]['is_consistent'] = False
458
+ # Update to most common type or 'mixed'
459
+ all_keys[key]['type'] = 'mixed'
460
+
461
+ # Track max depth
462
+ max_depth = max(max_depth, key_info['depth'])
463
+
464
+ # Add to sample data (first 5 records, truncated)
465
+ if i < 5:
466
+ # Truncate sample data to essential fields only
467
+ truncated_record = {}
468
+ for j, (k, v) in enumerate(record.items()):
469
+ if j >= 5: # Only first 5 fields
470
+ break
471
+ if isinstance(v, (dict, list)):
472
+ truncated_record[k] = f"<{type(v).__name__}>"
473
+ else:
474
+ val_str = str(v)
475
+ truncated_record[k] = val_str[:50] + "..." if len(val_str) > 50 else val_str
476
+ sample_data.append(truncated_record)
477
+
478
+ elif isinstance(record, list):
479
+ structure_types.add('array')
480
+ if i < 3:
481
+ # Truncate arrays to first 5 items
482
+ truncated_array = record[:5]
483
+ sample_data.append(truncated_array)
484
+ else:
485
+ structure_types.add('primitive')
486
+ if i < 3:
487
+ # Truncate primitive values
488
+ val_str = str(record)
489
+ truncated_val = val_str[:50] + "..." if len(val_str) > 50 else val_str
490
+ sample_data.append(truncated_val)
491
+
492
+ # Convert keys to columns format with truncation
493
+ columns = []
494
+ for key, key_info in all_keys.items():
495
+ # Determine final type
496
+ final_type = key_info['type']
497
+ if not key_info['is_consistent']:
498
+ final_type = 'mixed'
499
+
500
+ # Get unique sample values (limit to 2, truncate to 30 chars each)
501
+ unique_samples = []
502
+ for sample in list(set(key_info['sample_values']))[:2]:
503
+ sample_str = str(sample)
504
+ truncated_sample = sample_str[:30] + "..." if len(sample_str) > 30 else sample_str
505
+ unique_samples.append(truncated_sample)
506
+
507
+ columns.append({
508
+ 'name': key,
509
+ 'dataType': final_type,
510
+ 'description': f'Field {key} of type {final_type}',
511
+ 'sample_values': unique_samples,
512
+ 'is_consistent': key_info['is_consistent'],
513
+ 'depth': key_info['depth']
514
+ })
515
+
516
+ # Determine file format description
517
+ if 'object' in structure_types and len(structure_types) == 1:
518
+ format_desc = 'JSON object'
519
+ elif 'array' in structure_types and len(structure_types) == 1:
520
+ format_desc = 'JSON array'
521
+ elif 'jsonl' in str(filepath).lower():
522
+ format_desc = 'Line-delimited JSON (JSONL)'
523
+ else:
524
+ format_desc = 'Mixed JSON structure'
525
+
526
+ # Calculate statistics
527
+ unique_keys_count = len(all_keys)
528
+ consistent_keys = sum(1 for k in all_keys.values() if k['is_consistent'])
529
+
530
+ return {
531
+ 'success': True,
532
+ 'fileId': filepath,
533
+ 'fileName': filename,
534
+ 'filePath': filepath,
535
+ 'fileType': 'json',
536
+ 'extractedAt': datetime.now().isoformat(),
537
+ 'summary': f'{format_desc} with {total_records} record{"s" if total_records != 1 else ""}, {unique_keys_count} unique field{"s" if unique_keys_count != 1 else ""}',
538
+ 'totalRows': total_records,
539
+ 'totalColumns': unique_keys_count,
540
+ 'columns': columns,
541
+ 'sampleData': sample_data,
542
+ 'fileMtime': current_mtime,
543
+ 'structure_info': {
544
+ 'format_type': format_desc,
545
+ 'max_depth': max_depth,
546
+ 'consistent_fields': consistent_keys,
547
+ 'total_fields': unique_keys_count,
548
+ 'structure_types': list(structure_types)
549
+ }
550
+ }
551
+
552
+ def _read_file_preview_optimized(self, filepath: str, max_chars: int = 2000, max_newlines: int = 5) -> Tuple[str, bool]:
553
+ """
554
+ Ultra-fast file preview reader using efficient buffered reading.
555
+ Reads up to max_chars characters or max_newlines newlines, whichever limit is reached first.
556
+ Returns (content, is_truncated)
557
+ """
558
+ try:
559
+ file_size = os.path.getsize(filepath)
560
+
561
+ # For very small files, read directly
562
+ if file_size <= max_chars:
563
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
564
+ content = f.read()
565
+ return content, False
566
+
567
+ # For larger files, read in chunks and stop at limits
568
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
569
+ content = f.read(max_chars)
570
+
571
+ # Count newlines in the content we read
572
+ newline_count = content.count('\n')
573
+
574
+ # Within the newline limit - return the chunk as-is
575
+ if newline_count <= max_newlines:
576
+ # Check if there's more content (for truncation flag)
577
+ next_chunk = f.read(1)
578
+ is_truncated = bool(next_chunk)
579
+ return content, is_truncated
580
+
581
+ # More newlines than allowed - truncate to the first max_newlines lines
582
+ lines = content.split('\n', max_newlines + 1)
583
+ if len(lines) > max_newlines:
584
+ # We have more than max_newlines, so truncate
585
+ truncated_content = '\n'.join(lines[:max_newlines])
586
+ return truncated_content, True
587
+ else:
588
+ # Exactly max_newlines, check if there's more content
589
+ next_chunk = f.read(1)
590
+ is_truncated = bool(next_chunk)
591
+ return content, is_truncated
592
+
593
+ except (UnicodeDecodeError, IOError, OSError):
594
+ try:
595
+ # Fallback for problematic files
596
+ with open(filepath, 'rb') as f:
597
+ raw_bytes = f.read(max_chars)
598
+ content = raw_bytes.decode('utf-8', errors='replace')
599
+ # Apply newline limit to fallback content too
600
+ lines = content.split('\n')
601
+ if len(lines) > max_newlines:
602
+ content = '\n'.join(lines[:max_newlines])
603
+ return content, True
604
+ return content, len(raw_bytes) == max_chars
605
+ except Exception:
606
+ return f"<Error reading file: {filepath}>", False
607
+
608
+ def _get_file_type_info(self, filepath: str, extension: str) -> Dict[str, Any]:
609
+ """Get optimized metadata about file type"""
610
+ file_info = {
611
+ 'extension': extension,
612
+ 'is_csv': extension == '.csv',
613
+ 'is_tsv': extension == '.tsv',
614
+ 'is_json': extension == '.json',
615
+ 'is_parquet': extension == '.parquet',
616
+ 'is_pkl': extension in ['.pkl', '.pickle'],
617
+ 'is_xlsx': extension == '.xlsx',
618
+ 'is_text': extension in ['.txt', '.md', '.py', '.js', '.ts', '.html', '.xml'],
619
+ 'is_data': extension in ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.pkl', '.pickle', '.xlsx'],
620
+ 'is_binary': extension in ['.parquet', '.pkl', '.pickle', '.xlsx'] # Will be set later based on actual binary detection
621
+ }
622
+
623
+ try:
624
+ file_info['size_bytes'] = os.path.getsize(filepath)
625
+ file_info['last_modified'] = datetime.fromtimestamp(os.path.getmtime(filepath)).isoformat()
626
+ except (IOError, OSError):
627
+ file_info['size_bytes'] = 0
628
+ file_info['last_modified'] = None
629
+
630
+ return file_info
631
+
632
+ def _process_csv_preview(self, content: str, filepath: str) -> Dict[str, Any]:
633
+ """Fast CSV preview processing"""
634
+ # Split into lines efficiently, limit to what we need
635
+ newline_pos = content.find('\n')
636
+ if newline_pos == -1:
637
+ # Single line file
638
+ header = content.strip()
639
+ sample_rows = []
640
+ else:
641
+ # Multi-line file - get header and up to 5 sample rows
642
+ lines = content.split('\n', 6) # Get at most 6 lines (header + 5 samples)
643
+ header = lines[0] if lines[0] else None
644
+ sample_rows = [line for line in lines[1:6] if line.strip()]
645
+
646
+ result = {
647
+ 'type': 'csv',
648
+ 'preview_type': 'header_and_sample',
649
+ 'header': header,
650
+ 'sample_rows': sample_rows
651
+ }
652
+
653
+ if header:
654
+ result['estimated_columns'] = header.count(',') + 1
655
+
656
+ return result
657
+
658
+ def _process_json_preview(self, content: str, filepath: str) -> Dict[str, Any]:
659
+ """Enhanced JSON structure analysis with actual parsing"""
660
+ import json
661
+
662
+ result = {
663
+ 'type': 'json',
664
+ 'preview_type': 'parsed_structure'
665
+ }
666
+
667
+ try:
668
+ # Try to parse the content as JSON
669
+ data = json.loads(content)
670
+
671
+ if isinstance(data, dict):
672
+ result['structure'] = 'object'
673
+ result['keys'] = list(data.keys())[:5] # First 5 keys only
674
+ result['key_count'] = len(data.keys())
675
+
676
+ # Analyze key types (truncated)
677
+ key_types = {}
678
+ for i, (key, value) in enumerate(data.items()):
679
+ if i >= 5: # Limit to first 5 keys
680
+ break
681
+ key_types[key] = self._infer_json_type(value)
682
+ result['key_types'] = key_types
683
+
684
+ elif isinstance(data, list):
685
+ result['structure'] = 'array'
686
+ result['item_count'] = len(data)
687
+ result['sample_items'] = data[:3] # First 3 items
688
+
689
+ # Analyze item types
690
+ if data:
691
+ item_types = [self._infer_json_type(item) for item in data[:5]]
692
+ result['item_types'] = item_types
693
+
694
+ # If items are objects, get their keys
695
+ if isinstance(data[0], dict):
696
+ result['sample_keys'] = list(data[0].keys())[:5]
697
+
698
+ else:
699
+ result['structure'] = 'primitive'
700
+ result['value'] = str(data)[:50]
701
+ result['value_type'] = self._infer_json_type(data)
702
+
703
+ except json.JSONDecodeError:
704
+ # Fallback to line-by-line parsing for JSONL
705
+ lines = content.strip().split('\n')
706
+ if lines and lines[0].strip().startswith('{'):
707
+ result['structure'] = 'jsonl'
708
+ result['line_count'] = len(lines)
709
+
710
+ # Try to parse first few lines
711
+ sample_objects = []
712
+ for line in lines[:2]: # Only first 2 lines
713
+ try:
714
+ obj = json.loads(line.strip())
715
+ sample_objects.append(obj)
716
+ except:
717
+ continue
718
+
719
+ if sample_objects:
720
+ result['sample_objects'] = sample_objects
721
+ # Get keys from first object
722
+ if isinstance(sample_objects[0], dict):
723
+ result['sample_keys'] = list(sample_objects[0].keys())[:5]
724
+ else:
725
+ # Fallback to basic analysis
726
+ content_stripped = content.lstrip()
727
+ if not content_stripped:
728
+ result['structure'] = 'empty'
729
+ else:
730
+ first_char = content_stripped[0]
731
+ if first_char == '{':
732
+ result['structure'] = 'object'
733
+ result['estimated_keys'] = content_stripped.count('":')
734
+ elif first_char == '[':
735
+ result['structure'] = 'array'
736
+ result['estimated_items'] = content_stripped.count(',') + 1
737
+ else:
738
+ result['structure'] = 'unknown'
739
+
740
+ except Exception as e:
741
+ result['structure'] = 'error'
742
+ result['error'] = str(e)
743
+
744
+ return result
745
+
746
+ def _get_data_type(self, dtype_str: str) -> str:
747
+ """Map pandas dtypes to readable types"""
748
+ if dtype_str.startswith('int'):
749
+ return 'integer'
750
+ elif dtype_str.startswith('float'):
751
+ return 'float'
752
+ elif dtype_str == 'bool':
753
+ return 'boolean'
754
+ elif dtype_str.startswith('datetime'):
755
+ return 'datetime'
756
+ elif dtype_str == 'object':
757
+ return 'string'
758
+ else:
759
+ return 'string'
760
+
761
+ def _analyze_dataframe(self, df: pd.DataFrame, file_type: str) -> Dict[str, Any]:
762
+ """Analyze DataFrame and return schema information"""
763
+ # Get basic info
764
+ total_rows_sample = len(df)
765
+ total_columns = len(df.columns)
766
+
767
+ # Get column information
768
+ columns = []
769
+ sample_data = []
770
+
771
+ for col in df.columns:
772
+ dtype = str(df[col].dtype)
773
+ data_type = self._get_data_type(dtype)
774
+
775
+ # For object columns, try to infer if it's a date
776
+ if dtype == 'object' and not df[col].dropna().empty:
777
+ sample_val = df[col].dropna().iloc[0]
778
+ try:
779
+ pd.to_datetime(sample_val)
780
+ data_type = 'date'
781
+ except:
782
+ pass
783
+
784
+ columns.append({
785
+ 'name': str(col),
786
+ 'dataType': data_type,
787
+ 'description': f'Column {col} of type {data_type}'
788
+ })
789
+
790
+ # Get sample data (first 5 rows)
791
+ for _, row in df.head(5).iterrows():
792
+ sample_data.append(row.fillna('').astype(str).tolist())
793
+
794
+ return {
795
+ 'success': True,
796
+ 'totalRows': total_rows_sample,
797
+ 'totalColumns': total_columns,
798
+ 'columns': columns,
799
+ 'sampleData': sample_data,
800
+ 'summary': f'{file_type.upper()} file with {total_columns} columns'
801
+ }
802
+
803
+ async def scan_directories(self, paths: List[str], force_refresh: bool = False, workspace_root: str = None) -> Dict[str, Any]:
804
+ """Scan multiple directories and return file information without reprocessing unchanged files.
805
+
806
+ Strategy:
807
+ - Quickly enumerate directory contents (recursively)
808
+ - For files, return cached rich entries if mtime unchanged
809
+ - Otherwise, return a lightweight placeholder and process the file asynchronously to populate cache
810
+ - Schema extraction reuses existing caching/async logic
811
+ """
812
+ all_files: List[Dict[str, Any]] = []
813
+ scanned_directories: List[Dict[str, Any]] = []
814
+ cached_count = 0
815
+
816
+ if workspace_root is None:
817
+ workspace_root = os.getcwd()
818
+ original_root_path = Path(workspace_root)
819
+
820
+ scanned_directories_cache = self.file_scan_cache.get_scanned_directories()
821
+
822
+ for path in paths:
823
+ scanned_directory_cache = next((dir for dir in scanned_directories_cache if dir.get('path') == path), None)
824
+ dir_cached = True
825
+ abs_path = os.path.abspath(os.getcwd() if path in ('.', './') else path)
826
+
827
+ files_for_dir: List[Dict[str, Any]] = []
828
+ base_dir = Path(abs_path)
829
+ if not base_dir.exists() or not base_dir.is_dir():
830
+ scanned_directories.append({
831
+ 'path': abs_path,
832
+ 'file_count': 0,
833
+ 'scanned_at': datetime.now().isoformat(),
834
+ })
835
+ continue
836
+
837
+ # Walk directory tree with shallow Path.iterdir recursion to keep control and avoid reading file contents
838
+ stack: List[Tuple[Path, int]] = [(base_dir, 0)]
839
+ max_depth = 10
840
+ while stack:
841
+ current_dir, depth = stack.pop()
842
+ if depth > max_depth:
843
+ continue
844
+ try:
845
+ items = await self._list_directory_async(current_dir)
846
+ for item in items:
847
+ # Skip hidden and excluded directories
848
+ if item.is_dir():
849
+ if item.name.startswith('.') or item.name in self.EXCLUDE_DIRS:
850
+ continue
851
+ # Create directory entry (simple)
852
+ try:
853
+ relative_path = str(item.relative_to(original_root_path))
854
+ except ValueError:
855
+ relative_path = str(item.name)
856
+ try:
857
+ normalized_path = str(item.resolve().relative_to(original_root_path)) if item.resolve().is_relative_to(original_root_path) else str(item.name)
858
+ except Exception:
859
+ normalized_path = str(item)
860
+ dir_entry = {
861
+ 'id': str(item),
862
+ 'name': item.name,
863
+ 'absolute_path': str(item.absolute()),
864
+ 'path': str(item),
865
+ 'normalized_path': normalized_path,
866
+ 'relative_path': relative_path,
867
+ 'is_directory': True,
868
+ 'file_info': {'is_directory': True}
869
+ }
870
+ files_for_dir.append(dir_entry)
871
+ # Recurse
872
+ stack.append((item, depth + 1))
873
+ continue
874
+
875
+ # File handling
876
+ if item.name.startswith('.'):
877
+ continue
878
+
879
+ abs_file_path = str(item.absolute())
880
+ cached_entry = self.file_scan_cache.get_file_entry(abs_file_path)
881
+
882
+ current_mtime = None
883
+ try:
884
+ current_mtime = os.path.getmtime(abs_file_path)
885
+ except Exception:
886
+ pass
887
+
888
+ # Check if schema extraction has timed out (older than 60 seconds)
889
+ if (cached_entry and isinstance(cached_entry, dict) and
890
+ cached_entry.get('schema') and cached_entry.get('schema', {}).get('started_at')):
891
+ schema_started_at = cached_entry.get('schema', {}).get('started_at')
892
+ try:
893
+ started_time = datetime.fromisoformat(schema_started_at)
894
+ time_diff = datetime.now() - started_time
895
+ if time_diff > timedelta(seconds=60):
896
+ cached_entry['schema'] = {
897
+ 'loading': False,
898
+ 'error': 'Schema extraction timed out',
899
+ }
900
+ self.file_scan_cache.set_file_entry(abs_file_path, cached_entry)
901
+ continue
902
+ except (ValueError, TypeError):
903
+ # If we can't parse the datetime, continue with normal processing
904
+ pass
905
+
906
+ use_cached = False
907
+ if cached_entry and isinstance(cached_entry, dict) and not force_refresh:
908
+ cached_mtime = cached_entry.get('file_mtime') or cached_entry.get('fileMtime')
909
+ schema_info = cached_entry.get('schema')
910
+ if schema_info and isinstance(schema_info, dict) and schema_info.get('loading') is True:
911
+ use_cached = True
912
+ if current_mtime is not None and cached_mtime is not None and abs(float(cached_mtime)) == abs(float(current_mtime)):
913
+ use_cached = True
914
+
915
+ if use_cached:
916
+ entry = dict(cached_entry)
917
+ cached_count += 1
918
+ else:
919
+ dir_cached = False
920
+ # Lightweight placeholder while we process in background
921
+ try:
922
+ relative_path = str(item.relative_to(original_root_path))
923
+ except ValueError:
924
+ relative_path = str(item.name)
925
+ try:
926
+ normalized_path = str(item.resolve().relative_to(original_root_path)) if item.resolve().is_relative_to(original_root_path) else str(item.name)
927
+ except Exception:
928
+ normalized_path = str(item)
929
+ entry = {
930
+ 'id': str(item),
931
+ 'name': item.stem,
932
+ 'absolute_path': abs_file_path,
933
+ 'path': str(item),
934
+ 'normalized_path': normalized_path,
935
+ 'relative_path': relative_path,
936
+ 'is_directory': False,
937
+ 'file_mtime': current_mtime,
938
+ 'schema': {
939
+ 'loading': True,
940
+ 'started_at': datetime.now().isoformat(),
941
+ }
942
+ }
943
+
944
+ self.file_scan_cache.set_file_entry(abs_file_path, entry)
945
+ # Schedule background processing to populate cache
946
+ self._executor.submit(self.extract_schema, abs_file_path)
947
+
948
+ files_for_dir.append(entry)
949
+ except (IOError, OSError, PermissionError):
950
+ continue
951
+
952
+ all_files.extend(files_for_dir)
953
+ if dir_cached and scanned_directory_cache:
954
+ scanned_directories.append(scanned_directory_cache)
955
+ else:
956
+ scanned_directories.append({
957
+ 'path': abs_path,
958
+ 'file_count': len(files_for_dir),
959
+ 'scanned_at': datetime.now().isoformat(),
960
+ })
961
+
962
+ # De-duplicate by absolute path and directory flag
963
+ unique_seen = set()
964
+ deduped_files: List[Dict[str, Any]] = []
965
+ for entry in all_files:
966
+ abs_path_val = entry.get('absolute_path')
967
+ is_dir = bool(entry.get('is_directory'))
968
+ key = (abs_path_val, is_dir)
969
+ if not abs_path_val:
970
+ deduped_files.append(entry)
971
+ continue
972
+ if key in unique_seen:
973
+ continue
974
+ unique_seen.add(key)
975
+ deduped_files.append(entry)
976
+
977
+ return {
978
+ 'files': deduped_files,
979
+ 'scanned_directories': scanned_directories,
980
+ 'cached': cached_count > 0,
981
+ 'total_files': len(deduped_files)
982
+ }
983
+
984
+ def extract_schema(self, file_path: str, force_refresh: bool = False) -> Dict[str, Any]:
985
+ """Extract schema from a file using pandas"""
986
+ try:
987
+ # Callers pass an absolute path; use it as-is
988
+ abs_path = file_path
989
+
990
+ # Get current mtime for caching
991
+ current_mtime = None
992
+ try:
993
+ current_mtime = os.path.getmtime(abs_path)
994
+ except Exception:
995
+ pass
996
+
997
+ item = Path(abs_path)
998
+
999
+ # Determine file type first
1000
+ extension = Path(abs_path).suffix.lower()
1001
+ file_type = None
1002
+ if extension == '.csv':
1003
+ file_type = 'csv'
1004
+ elif extension == '.tsv':
1005
+ file_type = 'tsv'
1006
+ elif extension == '.parquet':
1007
+ file_type = 'parquet'
1008
+ elif extension in ['.pkl', '.pickle']:
1009
+ file_type = 'pkl'
1010
+ elif extension == '.xlsx':
1011
+ file_type = 'xlsx'
1012
+ elif extension == '.json' or extension == '.jsonl':
1013
+ file_type = 'json'
1014
+
1015
+ entry = self.file_scan_cache.get_file_entry(abs_path) or {
1016
+ 'fileId': abs_path,
1017
+ 'fileName': item.name,
1018
+ 'filePath': abs_path,
1019
+ 'fileType': file_type,
1020
+ 'fileMtime': current_mtime
1021
+ }
1022
+
1023
+ # Check if file type is supported
1024
+ if file_type is None:
1025
+ if entry:
1026
+ entry['schema'] = {
1027
+ 'success': False,
1028
+ 'error': f'Unsupported file type: {extension}'
1029
+ }
1030
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1031
+ return entry
1032
+ else:
1033
+ return {
1034
+ 'success': False,
1035
+ 'error': f'Unsupported file type: {extension}'
1036
+ }
1037
+
1038
+ # Extract schema
1039
+ try:
1040
+ if file_type in ['csv', 'tsv']:
1041
+ separator = '\t' if file_type == 'tsv' else ','
1042
+ df = pd.read_csv(abs_path, sep=separator, nrows=5)
1043
+ elif file_type == 'parquet':
1044
+ df = ds.dataset(abs_path).scanner().head(5).to_pandas()
1045
+ df = df.head(5) # Limit to first 5 rows
1046
+ elif file_type == 'xlsx':
1047
+ # Read .xlsx files using openpyxl engine
1048
+ df = pd.read_excel(abs_path, engine='openpyxl', nrows=5)
1049
+
1050
+ # Get sheet count and names using openpyxl
1051
+ try:
1052
+ workbook = load_workbook(abs_path, read_only=True)
1053
+ sheet_names = workbook.sheetnames
1054
+ total_sheets = len(sheet_names)
1055
+ workbook.close()
1056
+ except Exception:
1057
+ sheet_names = ['Sheet1'] # Default sheet name
1058
+ total_sheets = 1 # Default to 1 if we can't determine
1059
+ elif file_type == 'pkl':
1060
+ print(f"Reading pickle file: {abs_path}")
1061
+ data = pd.read_pickle(abs_path)
1062
+ print(f"Data: {data}")
1063
+ if isinstance(data, pd.DataFrame):
1064
+ print(f"Data is a DataFrame: {data.head(5)}")
1065
+ df = data.head(5) # Limit to first 5 rows
1066
+ else:
1067
+ # Handle non-DataFrame pickle data
1068
+ print(f"Data is not a DataFrame: {type(data).__name__}")
1069
+
1070
+ # Get file info
1071
+ file_info = self._get_file_type_info(str(item), extension)
1072
+ entry['file_info'] = file_info
1073
+
1074
+ # Check if file is binary (pickle files are always binary)
1075
+ is_binary = True
1076
+ file_info['is_binary'] = True
1077
+
1078
+ # Generate content preview for the pickle data
1079
+ content_preview, is_truncated = self._generate_pickle_data_preview(data)
1080
+ entry['content_preview'] = content_preview
1081
+ entry['is_truncated'] = is_truncated
1082
+
1083
+ # Create schema for non-DataFrame pickle data
1084
+ schema = {
1085
+ 'success': True,
1086
+ 'fileId': abs_path,
1087
+ 'fileName': item.name,
1088
+ 'filePath': abs_path,
1089
+ 'fileType': file_type,
1090
+ 'extractedAt': datetime.now().isoformat(),
1091
+ 'summary': f'Pickle file containing {type(data).__name__}',
1092
+ 'columns': [],
1093
+ 'totalRows': len(data) if hasattr(data, '__len__') else 1,
1094
+ 'totalColumns': 0,
1095
+ 'fileMtime': current_mtime
1096
+ }
1097
+
1098
+ # Cache the entry
1099
+ if entry:
1100
+ entry['schema'] = schema
1101
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1102
+
1103
+ return schema
1104
+ elif file_type == 'json':
1105
+ # Read and analyze JSON file
1106
+ json_data, file_format, is_truncated = self._read_json_file(abs_path)
1107
+ schema = self._analyze_json_structure(json_data, abs_path, item.name, current_mtime)
1108
+
1109
+ # Get file info
1110
+ file_info = self._get_file_type_info(str(item), extension)
1111
+ entry['file_info'] = file_info
1112
+
1113
+ # JSON files are text files
1114
+ file_info['is_binary'] = False
1115
+
1116
+ # Read file preview for JSON files
1117
+ preview = self._read_file_preview_optimized(str(item))
1118
+ entry['content_preview'] = preview[0]
1119
+ entry['is_truncated'] = preview[1]
1120
+
1121
+ # Process JSON preview
1122
+ if preview[0]:
1123
+ entry['json_info'] = self._process_json_preview(preview[0], str(item))
1124
+
1125
+ # Cache the entry
1126
+ if entry:
1127
+ entry['schema'] = schema
1128
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1129
+
1130
+ return schema
1131
+
1132
+ # Get file info for DataFrame pickle files and other file types
1133
+ file_info = self._get_file_type_info(str(item), extension)
1134
+ entry['file_info'] = file_info
1135
+
1136
+ # Check if file is binary
1137
+ is_binary = self._is_binary_file(str(item))
1138
+
1139
+ if is_binary:
1140
+ # For binary files, just provide basic info and file path
1141
+ entry['content_preview'] = f"Binary file: {str(item)}"
1142
+ entry['is_truncated'] = False
1143
+ # Mark as binary in file_info
1144
+ file_info['is_binary'] = True
1145
+ else:
1146
+ # Read file preview with limits for text files
1147
+ preview = self._read_file_preview_optimized(str(item))
1148
+ entry['content_preview'] = preview[0]
1149
+ entry['is_truncated'] = preview[1]
1150
+ file_info['is_binary'] = False
1151
+
1152
+ content = entry['content_preview']
1153
+
1154
+ if (file_type == 'csv' or file_type == 'tsv') and content:
1155
+ entry['csv_info'] = self._process_csv_preview(content, str(item))
1156
+ elif file_type == 'json' and content:
1157
+ entry['json_info'] = self._process_json_preview(content, str(item))
1158
+
1159
+ if df is not None:
1160
+ result = self._analyze_dataframe(df, file_type)
1161
+
1162
+ if result['success']:
1163
+ schema = {
1164
+ 'success': True,
1165
+ 'fileId': abs_path,
1166
+ 'fileName': Path(abs_path).name,
1167
+ 'filePath': abs_path,
1168
+ 'fileType': file_type,
1169
+ 'extractedAt': datetime.now().isoformat(),
1170
+ 'summary': result['summary'],
1171
+ 'totalRows': result['totalRows'],
1172
+ 'totalColumns': result['totalColumns'],
1173
+ 'columns': result['columns'],
1174
+ 'sampleData': result['sampleData'],
1175
+ 'fileMtime': current_mtime
1176
+ }
1177
+
1178
+ # Add sheet count and names for Excel files
1179
+ if file_type == 'xlsx' and 'total_sheets' in locals():
1180
+ schema['totalSheets'] = total_sheets
1181
+ schema['sheetNames'] = sheet_names
1182
+ schema['summary'] = f'Excel file with {total_sheets} sheet{"s" if total_sheets > 1 else ""} ({", ".join(sheet_names)}), {result["totalColumns"]} columns'
1183
+
1184
+ if entry:
1185
+ entry['schema'] = schema
1186
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1187
+
1188
+ return schema
1189
+ else:
1190
+ if entry:
1191
+ entry['schema'] = {
1192
+ 'success': False,
1193
+ 'error': f'Failed to extract schema: {result["error"]}'
1194
+ }
1195
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1196
+ return entry
1197
+
1198
+ except Exception as e:
1199
+ if entry:
1200
+ entry['schema'] = {
1201
+ 'success': False,
1202
+ 'error': f'Failed to extract schema: {str(e)}'
1203
+ }
1204
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1205
+ return entry
1206
+ return {
1207
+ 'success': False,
1208
+ 'error': f'Failed to extract schema: {str(e)}'
1209
+ }
1210
+
1211
+ except Exception as e:
1212
+ entry = self.file_scan_cache.get_file_entry(abs_path)
1213
+ if entry:
1214
+ entry['schema'] = {
1215
+ 'success': False,
1216
+ 'error': f'Error extracting schema: {str(e)}'
1217
+ }
1218
+ self.file_scan_cache.set_file_entry(abs_path, entry)
1219
+ return entry
1220
+ return {
1221
+ 'success': False,
1222
+ 'error': f'Error extracting schema: {str(e)}'
1223
+ }
1224
+
1225
+ def get_scanned_directories(self) -> Dict[str, Any]:
1226
+ """Get list of currently scanned directories"""
1227
+ directories = self.file_scan_cache.get_scanned_directories()
1228
+
1229
+ return {
1230
+ 'directories': directories
1231
+ }
1232
+
1233
+ def update_scanned_directories(self, directories: List[Dict[str, Any]]) -> bool:
1234
+ """Update the list of scanned directories"""
1235
+ return self.file_scan_cache.set_scanned_directories(directories)
1236
+
1237
+ def shutdown(self):
1238
+ """Shutdown the service and cleanup resources"""
1239
+ if hasattr(self, '_executor'):
1240
+ self._executor.shutdown(wait=True)
1241
+
1242
+
1243
+ # Global instance
1244
+ _file_scanner_service = None
1245
+
1246
+
1247
+ def get_file_scanner_service() -> FileScannerService:
1248
+ """Get the global file scanner service instance"""
1249
+ global _file_scanner_service
1250
+ if _file_scanner_service is None:
1251
+ _file_scanner_service = FileScannerService()
1252
+ return _file_scanner_service
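
A minimal usage sketch of the new FileScannerService introduced in this release, based only on the code shown above. The import path, the example directory, and the asyncio wiring are assumptions for illustration, not documented behavior of the package.

import asyncio

from signalpilot_ai_internal.file_scanner_service import get_file_scanner_service


async def main() -> None:
    # Hypothetical usage; paths and wiring are illustrative assumptions.
    scanner = get_file_scanner_service()

    # scan_directories() enumerates the given paths, returning cached entries
    # for files whose mtime is unchanged and lightweight placeholder entries
    # (schema marked as loading) for files still being processed in background.
    result = await scanner.scan_directories(["./data"], force_refresh=False)
    print(f"{result['total_files']} entries, cache hit: {result['cached']}")

    # extract_schema() is synchronous; the service normally submits it to its
    # internal thread pool, but it can also be called directly for one file.
    schema = scanner.extract_schema("./data/example.csv")
    if schema.get("success"):
        print(schema["summary"])

    scanner.shutdown()


if __name__ == "__main__":
    asyncio.run(main())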