signalpilot-ai-internal 0.5.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. signalpilot_ai_internal/_version.py +1 -1
  2. signalpilot_ai_internal/cache_service.py +152 -1
  3. signalpilot_ai_internal/file_scanner_service.py +1396 -0
  4. signalpilot_ai_internal/handlers.py +478 -2
  5. signalpilot_ai_internal/html_export_template/README.md +23 -0
  6. signalpilot_ai_internal/html_export_template/conf.json +12 -0
  7. signalpilot_ai_internal/html_export_template/index.html.j2 +140 -0
  8. signalpilot_ai_internal/schema_search_config.yml +8 -8
  9. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json +5 -3
  10. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig +4 -2
  11. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/330.af2e9cb5def5ae2b84d5.js +1 -0
  12. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.972abe1d2d66f083f9cc.js +1 -0
  13. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.ad22ccddd74ee306fb56.js +1 -0
  14. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.e9acd2e1f9739037f1ab.js +1 -0
  15. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.2d75de1a8d2c3131a8db.js +1 -0
  16. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/786.770dc7bcab77e14cc135.js +7 -0
  17. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.ca9e114a30896b669a3c.js +1 -0
  18. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.25ddd15aca09421d3765.js +1 -0
  19. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.b05b2f0c9617ba28370d.js +1 -0
  20. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/third-party-licenses.json +20 -26
  21. {signalpilot_ai_internal-0.5.1.dist-info → signalpilot_ai_internal-0.10.0.dist-info}/METADATA +3 -1
  22. signalpilot_ai_internal-0.10.0.dist-info/RECORD +50 -0
  23. {signalpilot_ai_internal-0.5.1.dist-info → signalpilot_ai_internal-0.10.0.dist-info}/WHEEL +1 -1
  24. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/104.04e170724f369fcbaf19.js +0 -2
  25. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/104.04e170724f369fcbaf19.js.LICENSE.txt +0 -24
  26. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/188.e781cc4c87f2dbf290ec.js +0 -1
  27. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.72484b768a04f89bd3dd.js +0 -1
  28. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.9b4f05a99f5003f82094.js +0 -1
  29. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/606.90aaaae46b73dc3c08fb.js +0 -1
  30. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.3aa564fc148b37d1d719.js +0 -1
  31. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/839.7ea0c8f6af45369912f3.js +0 -1
  32. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.5251a593584dd5d131d5.js +0 -2
  33. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.5251a593584dd5d131d5.js.LICENSE.txt +0 -1
  34. signalpilot_ai_internal-0.5.1.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.59ffb91489066223094b.js +0 -1
  35. signalpilot_ai_internal-0.5.1.dist-info/RECORD +0 -48
  36. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/etc/jupyter/jupyter_server_config.d/signalpilot_ai.json +0 -0
  37. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/install.json +0 -0
  38. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/plugin.json +0 -0
  39. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.e2dadf63dc64d7b5f1ee.js +0 -0
  40. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.328403b5545f268b95c6.js +0 -0
  41. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.726e1da31a50868cb297.js +0 -0
  42. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.dbec4c2dc12e7b050dcc.js +0 -0
  43. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.fa432bdb7fb6b1c95ad6.js +0 -0
  44. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.37e271d7a80336daabe2.js +0 -0
  45. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.73c7a9290b7d35a8b9c1.js +0 -0
  46. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.b58fc0093d080b8ee61c.js +0 -0
  47. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js +0 -0
  48. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js.LICENSE.txt +0 -0
  49. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.9720593ee20b768da3ca.js +0 -0
  50. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.8e6edc9a965bdd578ca7.js +0 -0
  51. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.dc49867fafb03ea2ba4d.js +0 -0
  52. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/742.91e7b516c8699eea3373.js +0 -0
  53. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/888.34054db17bcf6e87ec95.js +0 -0
  54. {signalpilot_ai_internal-0.5.1.data → signalpilot_ai_internal-0.10.0.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/style.js +0 -0
  55. {signalpilot_ai_internal-0.5.1.dist-info → signalpilot_ai_internal-0.10.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1396 @@
1
+ """
2
+ File Scanner Service for SignalPilot AI.
3
+ Handles file scanning, schema extraction, and directory tracking.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import hashlib
9
+ import threading
10
+ import asyncio
11
+ from datetime import datetime, timedelta
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Any, Tuple
14
+ import pandas as pd
15
+ import numpy as np
16
+ import concurrent.futures
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ import pyarrow.dataset as ds
19
+ from openpyxl import load_workbook
20
+
21
+ from .cache_service import get_cache_service, get_file_scan_cache_manager
22
+
23
+
24
+ class FileScannerService:
25
+ """Service for scanning directories and extracting file schemas"""
26
+
27
+ def __init__(self):
28
+ self.cache_service = get_cache_service()
29
+ self.file_scan_cache = get_file_scan_cache_manager()
30
+ self._lock = threading.RLock()
31
+
32
+ # Data file extensions
33
+ self.DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet', '.pkl', '.pickle',
34
+ '.feather', '.hdf5', '.h5', '.sql', '.db', '.sqlite', '.tsv', '.txt', '.ipynb'}
35
+
36
+ # Directories to exclude from search
37
+ self.EXCLUDE_DIRS = {'.git', '.ipynb_checkpoints', 'node_modules', '__pycache__',
38
+ '.venv', 'venv', 'env', '.pytest_cache', '.mypy_cache',
39
+ 'dist', 'build', '.tox', 'logs', '.vscode'}
40
+
41
+ # Thread pool for async schema extraction
42
+ self._executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="schema_extractor")
43
+
44
+ # Dedicated thread pool for directory scanning operations (I/O blocking)
45
+ self._dir_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="dir_scanner")
46
+
47
+ async def _list_directory_async(self, directory_path: Path) -> List[Path]:
48
+ """List directory contents using dedicated thread pool to avoid blocking."""
49
+ loop = asyncio.get_running_loop()
50
+ return await loop.run_in_executor(self._dir_executor, lambda: list(directory_path.iterdir()))
51
+
52
+ def __del__(self):
53
+ """Cleanup thread pools when service is destroyed."""
54
+ if hasattr(self, '_dir_executor'):
55
+ self._dir_executor.shutdown(wait=False)
56
+ if hasattr(self, '_executor'):
57
+ self._executor.shutdown(wait=False)
58
+
59
+ def _get_directory_hash(self, directory: str) -> str:
60
+ """Generate a hash for a directory path"""
61
+ return hashlib.md5(directory.encode()).hexdigest()[:16]
62
+
63
+
64
+ def _is_binary_file(self, filepath: str, chunk_size: int = 512) -> bool:
65
+ """Ultra-fast binary file detection with minimal I/O"""
66
+ try:
67
+ with open(filepath, 'rb') as f:
68
+ chunk = f.read(chunk_size)
69
+ if not chunk:
70
+ return False
71
+ # Fast null byte check - if any null bytes, it's binary
72
+ if b'\x00' in chunk:
73
+ return True
74
+ # Quick printable ratio check using bytes directly
75
+ printable = sum(1 for b in chunk if 32 <= b <= 126 or b in (9, 10, 13))
76
+ return (printable / len(chunk)) < 0.7
77
+ except (IOError, OSError):
78
+ return True
79
+
80
+ def _generate_pickle_data_preview(self, data: Any, max_items: int = 3, max_chars: int = 1000) -> Tuple[str, bool]:
81
+ """
82
+ Generate a content preview for non-DataFrame pickle data.
83
+ Returns (preview_content, is_truncated)
84
+ """
85
+ try:
86
+ data_type = type(data).__name__
87
+
88
+ if isinstance(data, (list, tuple)):
89
+ if len(data) == 0:
90
+ return f"Empty {data_type}", False
91
+
92
+ preview_items = []
93
+ for i, item in enumerate(data[:max_items]):
94
+ item_str = str(item)
95
+ if len(item_str) > 200:
96
+ item_str = item_str[:200] + "..."
97
+ preview_items.append(f"[{i}]: {item_str}")
98
+
99
+ preview = f"{data_type} with {len(data)} items:\n" + "\n".join(preview_items)
100
+ is_truncated = len(data) > max_items
101
+
102
+ if len(preview) > max_chars:
103
+ preview = preview[:max_chars] + "..."
104
+ is_truncated = True
105
+
106
+ return preview, is_truncated
107
+
108
+ elif isinstance(data, dict):
109
+ if len(data) == 0:
110
+ return f"Empty {data_type}", False
111
+
112
+ preview_items = []
113
+ for i, (key, value) in enumerate(list(data.items())[:max_items]):
114
+ key_str = str(key)
115
+ value_str = str(value)
116
+ if len(value_str) > 150:
117
+ value_str = value_str[:150] + "..."
118
+ preview_items.append(f"'{key_str}': {value_str}")
119
+
120
+ preview = f"{data_type} with {len(data)} keys:\n" + "\n".join(preview_items)
121
+ is_truncated = len(data) > max_items
122
+
123
+ if len(preview) > max_chars:
124
+ preview = preview[:max_chars] + "..."
125
+ is_truncated = True
126
+
127
+ return preview, is_truncated
128
+
129
+ elif isinstance(data, np.ndarray):
130
+ shape_str = str(data.shape)
131
+ dtype_str = str(data.dtype)
132
+
133
+ if data.size == 0:
134
+ return f"Empty numpy array: shape={shape_str}, dtype={dtype_str}", False
135
+
136
+ # Show first few elements
137
+ flat_data = data.flatten()[:max_items]
138
+ elements_str = ", ".join([str(x) for x in flat_data])
139
+
140
+ preview = f"numpy.ndarray: shape={shape_str}, dtype={dtype_str}\nFirst elements: [{elements_str}]"
141
+ is_truncated = data.size > max_items
142
+
143
+ if len(preview) > max_chars:
144
+ preview = preview[:max_chars] + "..."
145
+ is_truncated = True
146
+
147
+ return preview, is_truncated
148
+
149
+ elif isinstance(data, str):
150
+ if len(data) == 0:
151
+ return "Empty string", False
152
+
153
+ preview = f"String ({len(data)} chars): {data[:max_chars]}"
154
+ is_truncated = len(data) > max_chars
155
+ return preview, is_truncated
156
+
157
+ elif isinstance(data, (int, float, bool)):
158
+ return f"{data_type}: {data}", False
159
+
160
+ else:
161
+ # For other types, try to convert to string
162
+ data_str = str(data)
163
+ if len(data_str) > max_chars:
164
+ data_str = data_str[:max_chars] + "..."
165
+ is_truncated = True
166
+ else:
167
+ is_truncated = False
168
+
169
+ return f"{data_type}: {data_str}", is_truncated
170
+
171
+ except Exception as e:
172
+ return f"Error generating preview for {type(data).__name__}: {str(e)}", False
173
+
174
+ def _parse_json_array_simple(self, filepath: str, max_items: int = 5) -> Tuple[List[Any], bool]:
175
+ """
176
+ Simple JSON array parsing that reads first chunk and extracts items.
177
+ More robust for very large files.
178
+ Returns (items_list, is_truncated)
179
+ """
180
+ import json
181
+
182
+ try:
183
+ # Read first 50KB of the file
184
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
185
+ chunk = f.read(50000) # 50KB chunk
186
+
187
+ # Try to find the opening bracket
188
+ bracket_pos = chunk.find('[')
189
+ if bracket_pos == -1:
190
+ # Not an array, try as single object
191
+ f.seek(0)
192
+ try:
193
+ single_obj = json.load(f)
194
+ return [single_obj], False
195
+ except:
196
+ return [], False
197
+
198
+ # Find the first few complete JSON objects in the chunk
199
+ items = []
200
+ current_pos = bracket_pos + 1
201
+
202
+ while len(items) < max_items and current_pos < len(chunk):
203
+ # Find the next complete JSON object
204
+ brace_count = 0
205
+ start_pos = current_pos
206
+ in_string = False
207
+ escape_next = False
208
+
209
+ for i in range(current_pos, len(chunk)):
210
+ char = chunk[i]
211
+
212
+ if escape_next:
213
+ escape_next = False
214
+ continue
215
+ elif char == '\\':
216
+ escape_next = True
217
+ continue
218
+ elif char == '"' and not escape_next:
219
+ in_string = not in_string
220
+ elif not in_string:
221
+ if char == '{':
222
+ brace_count += 1
223
+ elif char == '}':
224
+ brace_count -= 1
225
+ if brace_count == 0:
226
+ # Found complete object
227
+ try:
228
+ obj_str = chunk[start_pos:i+1].strip()
229
+ if obj_str.startswith(','):
230
+ obj_str = obj_str[1:].strip()
231
+ if obj_str:
232
+ obj = json.loads(obj_str)
233
+ items.append(obj)
234
+ except:
235
+ pass
236
+ current_pos = i + 1
237
+ break
238
+ elif char == ',' and brace_count == 0 and not in_string:
239
+ # End of current item
240
+ try:
241
+ obj_str = chunk[start_pos:i].strip()
242
+ if obj_str.startswith(','):
243
+ obj_str = obj_str[1:].strip()
244
+ if obj_str:
245
+ obj = json.loads(obj_str)
246
+ items.append(obj)
247
+ except:
248
+ pass
249
+ current_pos = i + 1
250
+ break
251
+ else:
252
+ break
253
+
254
+ # Check if there's more content
255
+ remaining = f.read(1000)
256
+ is_truncated = len(remaining) > 0 or len(items) == max_items
257
+
258
+ return items, is_truncated
259
+
260
+ except Exception:
261
+ return [], False
262
+
263
+ def _read_json_file(self, filepath: str, max_items: int = 5) -> Tuple[List[Any], str, bool]:
264
+ """
265
+ Read JSON file with smart loading strategy.
266
+ Returns (data_list, file_format, is_truncated)
267
+ file_format: 'object', 'array', or 'jsonl'
268
+ """
269
+ import json
270
+
271
+ try:
272
+ # Check file size
273
+ file_size = os.path.getsize(filepath)
274
+ size_mb = file_size / (1024 * 1024)
275
+
276
+ # For small files (< 10MB), use simple json.load()
277
+ if size_mb < 10:
278
+ try:
279
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
280
+ data = json.load(f)
281
+
282
+ if isinstance(data, list):
283
+ return data[:max_items], 'array', len(data) > max_items
284
+ elif isinstance(data, dict):
285
+ return [data], 'object', False
286
+ else:
287
+ return [data], 'primitive', False
288
+ except json.JSONDecodeError:
289
+ # Try as JSONL if JSON parsing fails
290
+ pass
291
+
292
+ # For large files or if JSON parsing failed, try incremental parsing
293
+ try:
294
+ # First check if it's JSONL (line-delimited JSON)
295
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
296
+ first_line = f.readline().strip()
297
+ if first_line and first_line.startswith('{') and first_line.endswith('}'):
298
+ # Likely JSONL format
299
+ f.seek(0)
300
+ items = []
301
+ for i, line in enumerate(f):
302
+ if i >= max_items:
303
+ break
304
+ line = line.strip()
305
+ if line:
306
+ try:
307
+ item = json.loads(line)
308
+ items.append(item)
309
+ except:
310
+ continue
311
+ return items, 'jsonl', True # Assume truncated for large files
312
+
313
+ # Try simple array parsing
314
+ items, is_truncated = self._parse_json_array_simple(filepath, max_items)
315
+ if items:
316
+ return items, 'array', is_truncated
317
+
318
+ # Fallback: try to read first few lines as individual JSON objects
319
+ try:
320
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
321
+ items = []
322
+ for i, line in enumerate(f):
323
+ if i >= max_items:
324
+ break
325
+ line = line.strip()
326
+ if line and (line.startswith('{') or line.startswith('[')):
327
+ try:
328
+ obj = json.loads(line)
329
+ items.append(obj)
330
+ except:
331
+ continue
332
+ if items:
333
+ return items, 'jsonl', True
334
+ except:
335
+ pass
336
+
337
+ # Final fallback: try to read as single object
338
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
339
+ content = f.read(10000) # Read first 10KB
340
+ try:
341
+ data = json.loads(content)
342
+ if isinstance(data, dict):
343
+ return [data], 'object', True
344
+ except:
345
+ pass
346
+
347
+ return [], 'unknown', False
348
+
349
+ except Exception:
350
+ return [], 'unknown', False
351
+
352
+ except Exception:
353
+ return [], 'unknown', False
354
+
355
+ def _infer_json_type(self, value: Any) -> str:
356
+ """Infer data type from JSON value"""
357
+ if value is None:
358
+ return 'null'
359
+ elif isinstance(value, bool):
360
+ return 'boolean'
361
+ elif isinstance(value, int):
362
+ return 'integer'
363
+ elif isinstance(value, float):
364
+ return 'float'
365
+ elif isinstance(value, str):
366
+ # Try to detect if it's a numeric string
367
+ try:
368
+ float(value)
369
+ return 'numeric_string'
370
+ except:
371
+ return 'string'
372
+ elif isinstance(value, list):
373
+ return 'array'
374
+ elif isinstance(value, dict):
375
+ return 'object'
376
+ else:
377
+ return 'unknown'
378
+
379
+ def _extract_keys_from_object(self, obj: dict, prefix: str = "", max_depth: int = 3, max_fields: int = 50) -> Dict[str, Dict[str, Any]]:
380
+ """Recursively extract keys from nested objects with truncation limits"""
381
+ keys = {}
382
+
383
+ if max_depth <= 0 or len(keys) >= max_fields:
384
+ return keys
385
+
386
+ for i, (key, value) in enumerate(obj.items()):
387
+ if len(keys) >= max_fields:
388
+ break
389
+
390
+ full_key = f"{prefix}.{key}" if prefix else key
391
+ value_type = self._infer_json_type(value)
392
+
393
+ # Truncate sample values to 50 characters max
394
+ sample_value = None
395
+ if not isinstance(value, (dict, list)):
396
+ sample_str = str(value)
397
+ sample_value = sample_str[:50] + "..." if len(sample_str) > 50 else sample_str
398
+
399
+ keys[full_key] = {
400
+ 'type': value_type,
401
+ 'sample_value': sample_value,
402
+ 'is_nested': isinstance(value, (dict, list)),
403
+ 'depth': len(prefix.split('.')) if prefix else 0
404
+ }
405
+
406
+ # Recursively extract nested keys (reduced depth)
407
+ if isinstance(value, dict) and max_depth > 1 and len(keys) < max_fields:
408
+ nested_keys = self._extract_keys_from_object(value, full_key, max_depth - 1, max_fields - len(keys))
409
+ keys.update(nested_keys)
410
+ elif isinstance(value, list) and value and isinstance(value[0], dict) and len(keys) < max_fields:
411
+ # For arrays of objects, analyze the first object only
412
+ nested_keys = self._extract_keys_from_object(value[0], f"{full_key}[0]", max_depth - 1, max_fields - len(keys))
413
+ keys.update(nested_keys)
414
+
415
+ return keys
416
+
417
+ def _analyze_json_structure(self, data_list: List[Any], filepath: str, filename: str, current_mtime: float) -> Dict[str, Any]:
418
+ """
419
+ Analyze JSON structure and extract comprehensive schema information.
420
+ """
421
+ if not data_list:
422
+ return {
423
+ 'success': False,
424
+ 'error': f'No data found in JSON file. File may be too large ({os.path.getsize(filepath) / (1024*1024):.1f}MB) or malformed. Try using a smaller sample or check file format.'
425
+ }
426
+
427
+ # Analyze structure
428
+ all_keys = {}
429
+ sample_data = []
430
+ total_records = len(data_list)
431
+ max_depth = 0
432
+ structure_types = set()
433
+
434
+ # Process each record
435
+ for i, record in enumerate(data_list):
436
+ if isinstance(record, dict):
437
+ structure_types.add('object')
438
+ # Extract keys from this record
439
+ record_keys = self._extract_keys_from_object(record)
440
+
441
+ # Merge with all_keys, keeping track of types and sample values
442
+ for key, key_info in record_keys.items():
443
+ if key not in all_keys:
444
+ all_keys[key] = {
445
+ 'type': key_info['type'],
446
+ 'sample_values': [],
447
+ 'is_consistent': True,
448
+ 'depth': key_info['depth']
449
+ }
450
+
451
+ # Add sample value
452
+ if key_info['sample_value']:
453
+ all_keys[key]['sample_values'].append(key_info['sample_value'])
454
+
455
+ # Check type consistency
456
+ if all_keys[key]['type'] != key_info['type']:
457
+ all_keys[key]['is_consistent'] = False
458
+ # Update to most common type or 'mixed'
459
+ all_keys[key]['type'] = 'mixed'
460
+
461
+ # Track max depth
462
+ max_depth = max(max_depth, key_info['depth'])
463
+
464
+ # Add to sample data (first 5 records, truncated)
465
+ if i < 5:
466
+ # Truncate sample data to essential fields only
467
+ truncated_record = {}
468
+ for j, (k, v) in enumerate(record.items()):
469
+ if j >= 5: # Only first 5 fields
470
+ break
471
+ if isinstance(v, (dict, list)):
472
+ truncated_record[k] = f"<{type(v).__name__}>"
473
+ else:
474
+ val_str = str(v)
475
+ truncated_record[k] = val_str[:50] + "..." if len(val_str) > 50 else val_str
476
+ sample_data.append(truncated_record)
477
+
478
+ elif isinstance(record, list):
479
+ structure_types.add('array')
480
+ if i < 3:
481
+ # Truncate arrays to first 5 items
482
+ truncated_array = record[:5]
483
+ sample_data.append(truncated_array)
484
+ else:
485
+ structure_types.add('primitive')
486
+ if i < 3:
487
+ # Truncate primitive values
488
+ val_str = str(record)
489
+ truncated_val = val_str[:50] + "..." if len(val_str) > 50 else val_str
490
+ sample_data.append(truncated_val)
491
+
492
+ # Convert keys to columns format with truncation
493
+ columns = []
494
+ for key, key_info in all_keys.items():
495
+ # Determine final type
496
+ final_type = key_info['type']
497
+ if not key_info['is_consistent']:
498
+ final_type = 'mixed'
499
+
500
+ # Get unique sample values (limit to 2, truncate to 30 chars each)
501
+ unique_samples = []
502
+ for sample in list(set(key_info['sample_values']))[:2]:
503
+ sample_str = str(sample)
504
+ truncated_sample = sample_str[:30] + "..." if len(sample_str) > 30 else sample_str
505
+ unique_samples.append(truncated_sample)
506
+
507
+ columns.append({
508
+ 'name': key,
509
+ 'dataType': final_type,
510
+ 'description': f'Field {key} of type {final_type}',
511
+ 'sample_values': unique_samples,
512
+ 'is_consistent': key_info['is_consistent'],
513
+ 'depth': key_info['depth']
514
+ })
515
+
516
+ # Determine file format description
517
+ if 'object' in structure_types and len(structure_types) == 1:
518
+ format_desc = 'JSON object'
519
+ elif 'array' in structure_types and len(structure_types) == 1:
520
+ format_desc = 'JSON array'
521
+ elif 'jsonl' in str(filepath).lower():
522
+ format_desc = 'Line-delimited JSON (JSONL)'
523
+ else:
524
+ format_desc = 'Mixed JSON structure'
525
+
526
+ # Calculate statistics
527
+ unique_keys_count = len(all_keys)
528
+ consistent_keys = sum(1 for k in all_keys.values() if k['is_consistent'])
529
+
530
+ return {
531
+ 'success': True,
532
+ 'fileId': filepath,
533
+ 'fileName': filename,
534
+ 'filePath': filepath,
535
+ 'fileType': 'json',
536
+ 'extractedAt': datetime.now().isoformat(),
537
+ 'summary': f'{format_desc} with {total_records} record{"s" if total_records != 1 else ""}, {unique_keys_count} unique field{"s" if unique_keys_count != 1 else ""}',
538
+ 'totalRows': total_records,
539
+ 'totalColumns': unique_keys_count,
540
+ 'columns': columns,
541
+ 'sampleData': sample_data,
542
+ 'fileMtime': current_mtime,
543
+ 'structure_info': {
544
+ 'format_type': format_desc,
545
+ 'max_depth': max_depth,
546
+ 'consistent_fields': consistent_keys,
547
+ 'total_fields': unique_keys_count,
548
+ 'structure_types': list(structure_types)
549
+ }
550
+ }
551
+
552
+ def _read_file_preview_optimized(self, filepath: str, max_chars: int = 2000, max_newlines: int = 5) -> Tuple[str, bool]:
553
+ """
554
+ Ultra-fast file preview reader using efficient buffered reading.
555
+ Reads first 2000 characters OR first 3 newlines, whichever comes first.
556
+ Returns (content, is_truncated)
557
+ """
558
+ try:
559
+ file_size = os.path.getsize(filepath)
560
+
561
+ # For very small files, read directly
562
+ if file_size <= max_chars:
563
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
564
+ content = f.read()
565
+ return content, False
566
+
567
+ # For larger files, read in chunks and stop at limits
568
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
569
+ content = f.read(max_chars)
570
+
571
+ # Count newlines in the content we read
572
+ newline_count = content.count('\n')
573
+
574
+ # If we have 3 or fewer newlines, we're good
575
+ if newline_count <= max_newlines:
576
+ # Check if there's more content (for truncation flag)
577
+ next_chunk = f.read(1)
578
+ is_truncated = bool(next_chunk)
579
+ return content, is_truncated
580
+
581
+ # If we have more than 3 newlines, truncate to first 3
582
+ lines = content.split('\n', max_newlines + 1)
583
+ if len(lines) > max_newlines:
584
+ # We have more than max_newlines, so truncate
585
+ truncated_content = '\n'.join(lines[:max_newlines])
586
+ return truncated_content, True
587
+ else:
588
+ # Exactly max_newlines, check if there's more content
589
+ next_chunk = f.read(1)
590
+ is_truncated = bool(next_chunk)
591
+ return content, is_truncated
592
+
593
+ except (UnicodeDecodeError, IOError, OSError):
594
+ try:
595
+ # Fallback for problematic files
596
+ with open(filepath, 'rb') as f:
597
+ raw_bytes = f.read(max_chars)
598
+ content = raw_bytes.decode('utf-8', errors='replace')
599
+ # Apply newline limit to fallback content too
600
+ lines = content.split('\n')
601
+ if len(lines) > max_newlines:
602
+ content = '\n'.join(lines[:max_newlines])
603
+ return content, True
604
+ return content, len(raw_bytes) == max_chars
605
+ except Exception:
606
+ return f"<Error reading file: {filepath}>", False
607
+
608
+ def _get_file_type_info(self, filepath: str, extension: str) -> Dict[str, Any]:
609
+ """Get optimized metadata about file type"""
610
+ file_info = {
611
+ 'extension': extension,
612
+ 'is_csv': extension == '.csv',
613
+ 'is_tsv': extension == '.tsv',
614
+ 'is_json': extension == '.json',
615
+ 'is_parquet': extension == '.parquet',
616
+ 'is_pkl': extension in ['.pkl', '.pickle'],
617
+ 'is_xlsx': extension == '.xlsx',
618
+ 'is_ipynb': extension == '.ipynb',
619
+ 'is_text': extension in ['.txt', '.md', '.py', '.js', '.ts', '.html', '.xml', '.ipynb'],
620
+ 'is_data': extension in ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.pkl', '.pickle', '.xlsx'],
621
+ 'is_binary': extension in ['.parquet', '.pkl', '.pickle', '.xlsx'] # Will be set later based on actual binary detection
622
+ }
623
+
624
+ try:
625
+ file_info['size_bytes'] = os.path.getsize(filepath)
626
+ file_info['last_modified'] = datetime.fromtimestamp(os.path.getmtime(filepath)).isoformat()
627
+ except (IOError, OSError):
628
+ file_info['size_bytes'] = 0
629
+ file_info['last_modified'] = None
630
+
631
+ return file_info
632
+
633
+ def _process_csv_preview(self, content: str, filepath: str) -> Dict[str, Any]:
634
+ """Fast CSV preview processing"""
635
+ # Split into lines efficiently, limit to what we need
636
+ newline_pos = content.find('\n')
637
+ if newline_pos == -1:
638
+ # Single line file
639
+ header = content.strip()
640
+ sample_rows = []
641
+ else:
642
+ # Multi-line file - get header and up to 5 sample rows
643
+ lines = content.split('\n', 6) # Get at most 6 lines (header + 5 samples)
644
+ header = lines[0] if lines[0] else None
645
+ sample_rows = [line for line in lines[1:6] if line.strip()]
646
+
647
+ result = {
648
+ 'type': 'csv',
649
+ 'preview_type': 'header_and_sample',
650
+ 'header': header,
651
+ 'sample_rows': sample_rows
652
+ }
653
+
654
+ if header:
655
+ result['estimated_columns'] = header.count(',') + 1
656
+
657
+ return result
658
+
659
+ def _process_json_preview(self, content: str, filepath: str) -> Dict[str, Any]:
660
+ """Enhanced JSON structure analysis with actual parsing"""
661
+ import json
662
+
663
+ result = {
664
+ 'type': 'json',
665
+ 'preview_type': 'parsed_structure'
666
+ }
667
+
668
+ try:
669
+ # Try to parse the content as JSON
670
+ data = json.loads(content)
671
+
672
+ if isinstance(data, dict):
673
+ result['structure'] = 'object'
674
+ result['keys'] = list(data.keys())[:5] # First 5 keys only
675
+ result['key_count'] = len(data.keys())
676
+
677
+ # Analyze key types (truncated)
678
+ key_types = {}
679
+ for i, (key, value) in enumerate(data.items()):
680
+ if i >= 5: # Limit to first 5 keys
681
+ break
682
+ key_types[key] = self._infer_json_type(value)
683
+ result['key_types'] = key_types
684
+
685
+ elif isinstance(data, list):
686
+ result['structure'] = 'array'
687
+ result['item_count'] = len(data)
688
+ result['sample_items'] = data[:3] # First 3 items
689
+
690
+ # Analyze item types
691
+ if data:
692
+ item_types = [self._infer_json_type(item) for item in data[:5]]
693
+ result['item_types'] = item_types
694
+
695
+ # If items are objects, get their keys
696
+ if isinstance(data[0], dict):
697
+ result['sample_keys'] = list(data[0].keys())[:5]
698
+
699
+ else:
700
+ result['structure'] = 'primitive'
701
+ result['value'] = str(data)[:50]
702
+ result['value_type'] = self._infer_json_type(data)
703
+
704
+ except json.JSONDecodeError:
705
+ # Fallback to line-by-line parsing for JSONL
706
+ lines = content.strip().split('\n')
707
+ if lines and lines[0].strip().startswith('{'):
708
+ result['structure'] = 'jsonl'
709
+ result['line_count'] = len(lines)
710
+
711
+ # Try to parse first few lines
712
+ sample_objects = []
713
+ for line in lines[:2]: # Only first 2 lines
714
+ try:
715
+ obj = json.loads(line.strip())
716
+ sample_objects.append(obj)
717
+ except:
718
+ continue
719
+
720
+ if sample_objects:
721
+ result['sample_objects'] = sample_objects
722
+ # Get keys from first object
723
+ if isinstance(sample_objects[0], dict):
724
+ result['sample_keys'] = list(sample_objects[0].keys())[:5]
725
+ else:
726
+ # Fallback to basic analysis
727
+ content_stripped = content.lstrip()
728
+ if not content_stripped:
729
+ result['structure'] = 'empty'
730
+ else:
731
+ first_char = content_stripped[0]
732
+ if first_char == '{':
733
+ result['structure'] = 'object'
734
+ result['estimated_keys'] = content_stripped.count('":')
735
+ elif first_char == '[':
736
+ result['structure'] = 'array'
737
+ result['estimated_items'] = content_stripped.count(',') + 1
738
+ else:
739
+ result['structure'] = 'unknown'
740
+
741
+ except Exception as e:
742
+ result['structure'] = 'error'
743
+ result['error'] = str(e)
744
+
745
+ return result
746
+
747
+ def _get_data_type(self, dtype_str: str) -> str:
748
+ """Map pandas dtypes to readable types"""
749
+ if dtype_str.startswith('int'):
750
+ return 'integer'
751
+ elif dtype_str.startswith('float'):
752
+ return 'float'
753
+ elif dtype_str == 'bool':
754
+ return 'boolean'
755
+ elif dtype_str.startswith('datetime'):
756
+ return 'datetime'
757
+ elif dtype_str == 'object':
758
+ return 'string'
759
+ else:
760
+ return 'string'
761
+
762
+ def _analyze_dataframe(self, df: pd.DataFrame, file_type: str) -> Dict[str, Any]:
763
+ """Analyze DataFrame and return schema information"""
764
+ # Get basic info
765
+ total_rows_sample = len(df)
766
+ total_columns = len(df.columns)
767
+
768
+ # Get column information
769
+ columns = []
770
+ sample_data = []
771
+
772
+ for col in df.columns:
773
+ dtype = str(df[col].dtype)
774
+ data_type = self._get_data_type(dtype)
775
+
776
+ # For object columns, try to infer if it's a date
777
+ if dtype == 'object' and not df[col].dropna().empty:
778
+ sample_val = df[col].dropna().iloc[0]
779
+ try:
780
+ pd.to_datetime(sample_val)
781
+ data_type = 'date'
782
+ except:
783
+ pass
784
+
785
+ columns.append({
786
+ 'name': str(col),
787
+ 'dataType': data_type,
788
+ 'description': f'Column {col} of type {data_type}'
789
+ })
790
+
791
+ # Get sample data (first 5 rows)
792
+ for _, row in df.head(5).iterrows():
793
+ sample_data.append(row.fillna('').astype(str).tolist())
794
+
795
+ return {
796
+ 'success': True,
797
+ 'totalRows': total_rows_sample,
798
+ 'totalColumns': total_columns,
799
+ 'columns': columns,
800
+ 'sampleData': sample_data,
801
+ 'summary': f'{file_type.upper()} file with {total_columns} columns'
802
+ }
803
+
804
async def scan_directories(self, paths: List[str], force_refresh: bool = False, workspace_root: str = None) -> Dict[str, Any]:
    """Scan multiple directories and return file information without reprocessing unchanged files.

    Strategy:
    - Quickly enumerate directory contents (recursively)
    - For files, return cached rich entries if mtime unchanged
    - Otherwise, return a lightweight placeholder and process the file asynchronously to populate cache
    - Schema extraction reuses existing caching/async logic
    """
    all_files: List[Dict[str, Any]] = []
    scanned_directories: List[Dict[str, Any]] = []
    cached_count = 0  # how many entries were served straight from cache

    # Relative/normalized paths in the results are computed against this root.
    if workspace_root is None:
        workspace_root = os.getcwd()
    original_root_path = Path(workspace_root)

    scanned_directories_cache = self.file_scan_cache.get_scanned_directories()

    for path in paths:
        # Previously recorded scan metadata for this exact path string, if any.
        scanned_directory_cache = next((dir for dir in scanned_directories_cache if dir.get('path') == path), None)
        dir_cached = True  # flips to False once any file in this dir needs reprocessing
        abs_path = os.path.abspath(os.getcwd() if path in ('.', './') else path)

        files_for_dir: List[Dict[str, Any]] = []
        base_dir = Path(abs_path)
        if not base_dir.exists() or not base_dir.is_dir():
            # Missing or non-directory input is reported with a zero file count.
            scanned_directories.append({
                'path': abs_path,
                'file_count': 0,
                'scanned_at': datetime.now().isoformat(),
            })
            continue

        # Walk directory tree with shallow Path.iterdir recursion to keep control and avoid reading file contents
        stack: List[Tuple[Path, int]] = [(base_dir, 0)]
        max_depth = 10  # guard against pathological nesting / symlink cycles
        while stack:
            current_dir, depth = stack.pop()
            if depth > max_depth:
                continue
            try:
                items = await self._list_directory_async(current_dir)
                for item in items:
                    # Skip hidden and excluded directories
                    if item.is_dir():
                        if item.name.startswith('.') or item.name in self.EXCLUDE_DIRS:
                            continue
                        # Create directory entry (simple)
                        try:
                            relative_path = str(item.relative_to(original_root_path))
                        except ValueError:
                            # Item lies outside the workspace root; fall back to its name.
                            relative_path = str(item.name)
                        try:
                            normalized_path = str(item.resolve().relative_to(original_root_path)) if item.resolve().is_relative_to(original_root_path) else str(item.name)
                        except Exception:
                            normalized_path = str(item)
                        dir_entry = {
                            'id': str(item),
                            'name': item.name,
                            'absolute_path': str(item.absolute()),
                            'path': str(item),
                            'normalized_path': normalized_path,
                            'relative_path': relative_path,
                            'is_directory': True,
                            'file_info': {'is_directory': True}
                        }
                        files_for_dir.append(dir_entry)
                        # Recurse
                        stack.append((item, depth + 1))
                        continue

                    # File handling
                    if item.name.startswith('.'):
                        continue

                    abs_file_path = str(item.absolute())
                    cached_entry = self.file_scan_cache.get_file_entry(abs_file_path)

                    current_mtime = None
                    try:
                        current_mtime = os.path.getmtime(abs_file_path)
                    except Exception:
                        pass

                    # Check if schema extraction has timed out (older than 60 seconds)
                    if (cached_entry and isinstance(cached_entry, dict) and
                        cached_entry.get('schema') and cached_entry.get('schema', {}).get('started_at')):
                        schema_started_at = cached_entry.get('schema', {}).get('started_at')
                        try:
                            started_time = datetime.fromisoformat(schema_started_at)
                            time_diff = datetime.now() - started_time
                            if time_diff > timedelta(seconds=60):
                                # Mark the stale in-flight extraction as failed.
                                cached_entry['schema'] = {
                                    'loading': False,
                                    'error': 'Schema extraction timed out',
                                }
                                self.file_scan_cache.set_file_entry(abs_file_path, cached_entry)
                                # NOTE(review): this `continue` drops the timed-out file
                                # from this scan's results entirely — confirm intended.
                                continue
                        except (ValueError, TypeError):
                            # If we can't parse the datetime, continue with normal processing
                            pass

                    use_cached = False
                    if cached_entry and isinstance(cached_entry, dict) and not force_refresh:
                        # Cache entries may carry either key spelling for the mtime.
                        cached_mtime = cached_entry.get('file_mtime') or cached_entry.get('fileMtime')
                        schema_info = cached_entry.get('schema')
                        if schema_info and isinstance(schema_info, dict) and schema_info.get('loading') is True:
                            # A background extraction is already in flight; don't re-queue.
                            use_cached = True
                        if current_mtime is not None and cached_mtime is not None and abs(float(cached_mtime)) == abs(float(current_mtime)):
                            # Unchanged mtime -> reuse the cached rich entry.
                            use_cached = True

                    if use_cached:
                        entry = dict(cached_entry)
                        cached_count += 1
                    else:
                        dir_cached = False
                        # Lightweight placeholder while we process in background
                        try:
                            relative_path = str(item.relative_to(original_root_path))
                        except ValueError:
                            relative_path = str(item.name)
                        try:
                            normalized_path = str(item.resolve().relative_to(original_root_path)) if item.resolve().is_relative_to(original_root_path) else str(item.name)
                        except Exception:
                            normalized_path = str(item)
                        entry = {
                            'id': str(item),
                            'name': item.stem,
                            'absolute_path': abs_file_path,
                            'path': str(item),
                            'normalized_path': normalized_path,
                            'relative_path': relative_path,
                            'is_directory': False,
                            'file_mtime': current_mtime,
                            # 'loading' + 'started_at' drive the 60s timeout check above.
                            'schema': {
                                'loading': True,
                                'started_at': datetime.now().isoformat(),
                            }
                        }

                        self.file_scan_cache.set_file_entry(abs_file_path, entry)
                        # Schedule background processing to populate cache
                        self._executor.submit(self.extract_schema, abs_file_path)

                    files_for_dir.append(entry)
            except (IOError, OSError, PermissionError):
                # Unreadable directory: skip it and keep scanning the rest.
                continue

        all_files.extend(files_for_dir)
        if dir_cached and scanned_directory_cache:
            # Everything was served from cache: keep the prior scan record.
            scanned_directories.append(scanned_directory_cache)
        else:
            scanned_directories.append({
                'path': abs_path,
                'file_count': len(files_for_dir),
                'scanned_at': datetime.now().isoformat(),
            })

    # De-duplicate by absolute path and directory flag
    unique_seen = set()
    deduped_files: List[Dict[str, Any]] = []
    for entry in all_files:
        abs_path_val = entry.get('absolute_path')
        is_dir = bool(entry.get('is_directory'))
        key = (abs_path_val, is_dir)
        if not abs_path_val:
            # Entries without an absolute path cannot be keyed; keep them all.
            deduped_files.append(entry)
            continue
        if key in unique_seen:
            continue
        unique_seen.add(key)
        deduped_files.append(entry)

    return {
        'files': deduped_files,
        'scanned_directories': scanned_directories,
        'cached': cached_count > 0,
        'total_files': len(deduped_files)
    }
984
+
985
+ def _analyze_notebook_structure(self, cells: List[Dict[str, Any]], metadata: Dict[str, Any],
986
+ filepath: str, filename: str, current_mtime: float,
987
+ total_cells: int) -> Dict[str, Any]:
988
+ """
989
+ Analyze notebook structure and extract schema information.
990
+ """
991
+ code_cells = []
992
+ markdown_cells = []
993
+
994
+ for idx, cell in enumerate(cells):
995
+ cell_type = cell.get('cell_type', 'unknown')
996
+ source = cell.get('source', [])
997
+
998
+ # Join source lines if it's a list
999
+ if isinstance(source, list):
1000
+ source_text = ''.join(source)
1001
+ else:
1002
+ source_text = source
1003
+
1004
+ cell_info = {
1005
+ 'index': idx,
1006
+ 'cell_type': cell_type,
1007
+ 'source': source_text[:500], # Truncate to 500 chars
1008
+ 'execution_count': cell.get('execution_count'),
1009
+ }
1010
+
1011
+ # Add outputs for code cells (truncated)
1012
+ if cell_type == 'code':
1013
+ outputs = cell.get('outputs', [])
1014
+ truncated_outputs = []
1015
+ for output in outputs[:3]: # Max 3 outputs
1016
+ output_type = output.get('output_type', 'unknown')
1017
+ output_info = {'type': output_type}
1018
+
1019
+ if output_type == 'stream':
1020
+ text = output.get('text', [])
1021
+ if isinstance(text, list):
1022
+ text = ''.join(text)
1023
+ output_info['text'] = text[:200] # Truncate
1024
+ elif output_type in ('execute_result', 'display_data'):
1025
+ data = output.get('data', {})
1026
+ if 'text/plain' in data:
1027
+ plain_text = data['text/plain']
1028
+ if isinstance(plain_text, list):
1029
+ plain_text = ''.join(plain_text)
1030
+ output_info['text'] = plain_text[:200]
1031
+ elif output_type == 'error':
1032
+ output_info['ename'] = output.get('ename')
1033
+ output_info['evalue'] = output.get('evalue', '')[:200]
1034
+
1035
+ truncated_outputs.append(output_info)
1036
+
1037
+ cell_info['outputs'] = truncated_outputs
1038
+ code_cells.append(cell_info)
1039
+ elif cell_type == 'markdown':
1040
+ markdown_cells.append(cell_info)
1041
+
1042
+ # Extract kernel info
1043
+ kernel_info = metadata.get('kernelspec', {})
1044
+ kernel_name = kernel_info.get('name', 'unknown')
1045
+ kernel_language = kernel_info.get('language', 'unknown')
1046
+
1047
+ # Create columns format for consistency with other file types
1048
+ columns = [
1049
+ {
1050
+ 'name': 'cell_index',
1051
+ 'dataType': 'integer',
1052
+ 'description': 'Cell index in notebook'
1053
+ },
1054
+ {
1055
+ 'name': 'cell_type',
1056
+ 'dataType': 'string',
1057
+ 'description': 'Type of cell (code, markdown, raw)'
1058
+ },
1059
+ {
1060
+ 'name': 'source',
1061
+ 'dataType': 'string',
1062
+ 'description': 'Cell source code or markdown content'
1063
+ },
1064
+ {
1065
+ 'name': 'execution_count',
1066
+ 'dataType': 'integer',
1067
+ 'description': 'Execution count for code cells'
1068
+ },
1069
+ {
1070
+ 'name': 'outputs',
1071
+ 'dataType': 'array',
1072
+ 'description': 'Cell outputs (for code cells)'
1073
+ }
1074
+ ]
1075
+
1076
+ summary = f'Jupyter Notebook with {total_cells} total cells ({len(code_cells)} code, {len(markdown_cells)} markdown), kernel: {kernel_name}'
1077
+
1078
+ return {
1079
+ 'success': True,
1080
+ 'fileId': filepath,
1081
+ 'fileName': filename,
1082
+ 'filePath': filepath,
1083
+ 'fileType': 'ipynb',
1084
+ 'extractedAt': datetime.now().isoformat(),
1085
+ 'summary': summary,
1086
+ 'totalRows': total_cells,
1087
+ 'totalColumns': len(columns),
1088
+ 'columns': columns,
1089
+ 'sampleData': cells,
1090
+ 'fileMtime': current_mtime,
1091
+ 'notebook_info': {
1092
+ 'total_cells': total_cells,
1093
+ 'code_cells': len(code_cells),
1094
+ 'markdown_cells': len(markdown_cells),
1095
+ 'kernel_name': kernel_name,
1096
+ 'kernel_language': kernel_language
1097
+ }
1098
+ }
1099
+
1100
def extract_schema(self, file_path: str, force_refresh: bool = False, start_cell: int = 0, end_cell: int = 5) -> Dict[str, Any]:
    """Extract schema from a file using pandas.

    Dispatches on file extension (csv/tsv/parquet/pkl/xlsx/json/ipynb),
    reads only a small sample, stores the result on the file's cache entry
    via ``file_scan_cache``, and returns the schema dict (or the cache
    entry with an error schema on failure).

    Args:
        file_path: Path to the file (treated as already absolute).
        force_refresh: Accepted for interface parity; not consulted here.
        start_cell: First notebook cell to include (ipynb only).
        end_cell: One-past-last notebook cell to include (ipynb only).
    """
    try:
        # Convert to absolute path
        # NOTE(review): no conversion actually happens — the caller is
        # expected to pass an absolute path already.
        abs_path = file_path

        # Get current mtime for caching
        current_mtime = None
        try:
            current_mtime = os.path.getmtime(abs_path)
        except Exception:
            pass

        item = Path(abs_path)

        # Determine file type first
        extension = Path(abs_path).suffix.lower()
        file_type = None
        if extension == '.csv':
            file_type = 'csv'
        elif extension == '.tsv':
            file_type = 'tsv'
        elif extension == '.parquet':
            file_type = 'parquet'
        elif extension in ['.pkl', '.pickle']:
            file_type = 'pkl'
        elif extension == '.xlsx':
            file_type = 'xlsx'
        elif extension == '.json' or extension == '.jsonl':
            file_type = 'json'
        elif extension == '.ipynb':
            file_type = 'ipynb'

        # Reuse the existing cache entry, or seed a minimal one.
        entry = self.file_scan_cache.get_file_entry(abs_path) or {
            'fileId': abs_path,
            'fileName': item.name,
            'filePath': abs_path,
            'fileType': file_type,
            'fileMtime': current_mtime
        }

        # Check if file type is supported
        if file_type is None:
            # NOTE: entry is always truthy here because of the `or {...}`
            # fallback above, so the else branch is effectively unreachable.
            if entry:
                entry['schema'] = {
                    'success': False,
                    'error': f'Unsupported file type: {extension}'
                }
                self.file_scan_cache.set_file_entry(abs_path, entry)
                return entry
            else:
                return {
                    'success': False,
                    'error': f'Unsupported file type: {extension}'
                }

        # Extract schema
        # NOTE: `df` is only bound on the tabular branches below; the
        # json/ipynb/non-DataFrame-pickle branches return early instead.
        try:
            if file_type in ['csv', 'tsv']:
                separator = '\t' if file_type == 'tsv' else ','
                df = pd.read_csv(abs_path, sep=separator, nrows=5)
            elif file_type == 'parquet':
                # Sample via a pyarrow dataset scanner instead of loading
                # the whole file.
                df = ds.dataset(abs_path).scanner().head(5).to_pandas()
                df = df.head(5)  # Limit to first 5 rows
            elif file_type == 'xlsx':
                # Read .xlsx files using openpyxl engine
                df = pd.read_excel(abs_path, engine='openpyxl', nrows=5)

                # Get sheet count and names using openpyxl
                try:
                    workbook = load_workbook(abs_path, read_only=True)
                    sheet_names = workbook.sheetnames
                    total_sheets = len(sheet_names)
                    workbook.close()
                except Exception:
                    sheet_names = ['Sheet1']  # Default sheet name
                    total_sheets = 1  # Default to 1 if we can't determine
            elif file_type == 'pkl':
                # NOTE(review): unpickling executes arbitrary code — only
                # safe for trusted local files.
                print(f"Reading pickle file: {abs_path}")
                data = pd.read_pickle(abs_path)
                print(f"Data: {data}")
                if isinstance(data, pd.DataFrame):
                    print(f"Data is a DataFrame: {data.head(5)}")
                    df = data.head(5)  # Limit to first 5 rows
                else:
                    # Handle non-DataFrame pickle data
                    print(f"Data is not a DataFrame: {type(data).__name__}")

                    # Get file info
                    file_info = self._get_file_type_info(str(item), extension)
                    entry['file_info'] = file_info

                    # Check if file is binary (pickle files are always binary)
                    is_binary = True
                    file_info['is_binary'] = True

                    # Generate content preview for the pickle data
                    content_preview, is_truncated = self._generate_pickle_data_preview(data)
                    entry['content_preview'] = content_preview
                    entry['is_truncated'] = is_truncated

                    # Create schema for non-DataFrame pickle data
                    schema = {
                        'success': True,
                        'fileId': abs_path,
                        'fileName': item.name,
                        'filePath': abs_path,
                        'fileType': file_type,
                        'extractedAt': datetime.now().isoformat(),
                        'summary': f'Pickle file containing {type(data).__name__}',
                        'columns': [],
                        # NOTE(review): the second hasattr check can never
                        # be False once the first is — trailing `else 1`
                        # is dead code.
                        'totalRows': 1 if not hasattr(data, '__len__') else len(data) if hasattr(data, '__len__') else 1,
                        'totalColumns': 0,
                        'fileMtime': current_mtime
                    }

                    # Cache the entry
                    if entry:
                        entry['schema'] = schema
                        self.file_scan_cache.set_file_entry(abs_path, entry)

                    return schema
            elif file_type == 'json':
                # Read and analyze JSON file
                json_data, file_format, is_truncated = self._read_json_file(abs_path)
                schema = self._analyze_json_structure(json_data, abs_path, item.name, current_mtime)

                # Get file info
                file_info = self._get_file_type_info(str(item), extension)
                entry['file_info'] = file_info

                # JSON files are text files
                file_info['is_binary'] = False

                # Read file preview for JSON files
                preview = self._read_file_preview_optimized(str(item))
                entry['content_preview'] = preview[0]
                entry['is_truncated'] = preview[1]

                # Process JSON preview
                if preview[0]:
                    entry['json_info'] = self._process_json_preview(preview[0], str(item))

                # Cache the entry
                if entry:
                    entry['schema'] = schema
                    self.file_scan_cache.set_file_entry(abs_path, entry)

                return schema
            elif file_type == 'ipynb':
                # Read notebook file once
                with open(abs_path, 'r', encoding='utf-8') as f:
                    notebook = json.load(f)

                cells_all = notebook.get('cells', [])
                metadata = notebook.get('metadata', {})
                total_cells = len(cells_all)

                # Clamp indices and extract range
                start_idx = max(0, start_cell)
                end_idx = min(total_cells, end_cell)
                cells = cells_all[start_idx:end_idx]

                schema = self._analyze_notebook_structure(cells, metadata, abs_path, item.name, current_mtime, total_cells)

                # Get file info
                file_info = self._get_file_type_info(str(item), extension)
                entry['file_info'] = file_info
                file_info['is_binary'] = False

                # Cache the entry
                entry['schema'] = schema
                self.file_scan_cache.set_file_entry(abs_path, entry)

                return schema

            # Get file info for DataFrame pickle files and other file types
            file_info = self._get_file_type_info(str(item), extension)
            entry['file_info'] = file_info

            # Check if file is binary
            is_binary = self._is_binary_file(str(item))

            if is_binary:
                # For binary files, just provide basic info and file path
                entry['content_preview'] = f"Binary file: {str(item)}"
                entry['is_truncated'] = False
                # Mark as binary in file_info
                file_info['is_binary'] = True
            else:
                # Read file preview with limits for text files
                preview = self._read_file_preview_optimized(str(item))
                entry['content_preview'] = preview[0]
                entry['is_truncated'] = preview[1]
                file_info['is_binary'] = False

            content = entry['content_preview']

            # Attach lightweight text-preview analysis where applicable.
            if (file_type == 'csv' or file_type == 'tsv') and content:
                entry['csv_info'] = self._process_csv_preview(content, str(item))
            elif file_type == 'json' and content:
                entry['json_info'] = self._process_json_preview(content, str(item))

            if df is not None:
                result = self._analyze_dataframe(df, file_type)

                if result['success']:
                    schema = {
                        'success': True,
                        'fileId': abs_path,
                        'fileName': Path(abs_path).name,
                        'filePath': abs_path,
                        'fileType': file_type,
                        'extractedAt': datetime.now().isoformat(),
                        'summary': result['summary'],
                        'totalRows': result['totalRows'],
                        'totalColumns': result['totalColumns'],
                        'columns': result['columns'],
                        'sampleData': result['sampleData'],
                        'fileMtime': current_mtime
                    }

                    # Add sheet count and names for Excel files
                    if file_type == 'xlsx' and 'total_sheets' in locals():
                        schema['totalSheets'] = total_sheets
                        schema['sheetNames'] = sheet_names
                        schema['summary'] = f'Excel file with {total_sheets} sheet{"s" if total_sheets > 1 else ""} ({", ".join(sheet_names)}), {result["totalColumns"]} columns'

                    if entry:
                        entry['schema'] = schema
                        self.file_scan_cache.set_file_entry(abs_path, entry)

                    return schema
                else:
                    # NOTE(review): _analyze_dataframe never returns
                    # success=False with an 'error' key; this branch looks
                    # unreachable as written — confirm before relying on it.
                    if entry:
                        entry['schema'] = {
                            'success': False,
                            'error': f'Failed to extract schema: {result["error"]}'
                        }
                        self.file_scan_cache.set_file_entry(abs_path, entry)
                        return entry

        except Exception as e:
            # Read/parse failure on a supported type: record it on the entry.
            if entry:
                entry['schema'] = {
                    'success': False,
                    'error': f'Failed to extract schema: {str(e)}'
                }
                self.file_scan_cache.set_file_entry(abs_path, entry)
                return entry
            return {
                'success': False,
                'error': f'Failed to extract schema: {str(e)}'
            }

    except Exception as e:
        # Catch-all so background executor calls never raise.
        entry = self.file_scan_cache.get_file_entry(abs_path)
        if entry:
            entry['schema'] = {
                'success': False,
                'error': f'Error extracting schema: {str(e)}'
            }
            self.file_scan_cache.set_file_entry(abs_path, entry)
            return entry
        return {
            'success': False,
            'error': f'Error extracting schema: {str(e)}'
        }
1368
+
1369
def get_scanned_directories(self) -> Dict[str, Any]:
    """Return the cache's scanned-directory list wrapped in an API payload."""
    return {'directories': self.file_scan_cache.get_scanned_directories()}
1376
+
1377
def update_scanned_directories(self, directories: List[Dict[str, Any]]) -> bool:
    """Persist the scanned-directory list; returns the cache's success flag."""
    stored = self.file_scan_cache.set_scanned_directories(directories)
    return stored
1380
+
1381
def shutdown(self):
    """Shut down the service, waiting for queued background work to finish."""
    # EAFP equivalent of the hasattr guard: services constructed without
    # an executor are a no-op.
    try:
        executor = self._executor
    except AttributeError:
        return
    executor.shutdown(wait=True)
1385
+
1386
+
1387
# Global instance
# Module-level singleton holder; populated lazily by get_file_scanner_service().
_file_scanner_service = None
1389
+
1390
+
1391
def get_file_scanner_service() -> FileScannerService:
    """Return the shared FileScannerService, instantiating it on first use."""
    global _file_scanner_service
    if _file_scanner_service is None:
        _file_scanner_service = FileScannerService()
    return _file_scanner_service