signalpilot-ai-internal 0.10.22__py3-none-any.whl → 0.11.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. signalpilot_ai_internal/_version.py +1 -1
  2. signalpilot_ai_internal/cache_service.py +22 -21
  3. signalpilot_ai_internal/composio_handlers.py +224 -0
  4. signalpilot_ai_internal/composio_service.py +511 -0
  5. signalpilot_ai_internal/database_config_handlers.py +182 -0
  6. signalpilot_ai_internal/database_config_service.py +166 -0
  7. signalpilot_ai_internal/databricks_schema_service.py +19 -14
  8. signalpilot_ai_internal/file_scanner_service.py +5 -146
  9. signalpilot_ai_internal/handlers.py +317 -8
  10. signalpilot_ai_internal/integrations_config.py +256 -0
  11. signalpilot_ai_internal/log_utils.py +31 -0
  12. signalpilot_ai_internal/mcp_handlers.py +33 -9
  13. signalpilot_ai_internal/mcp_service.py +94 -142
  14. signalpilot_ai_internal/oauth_token_store.py +141 -0
  15. signalpilot_ai_internal/schema_search_config.yml +17 -11
  16. signalpilot_ai_internal/schema_search_service.py +30 -10
  17. signalpilot_ai_internal/signalpilot_home.py +961 -0
  18. signalpilot_ai_internal/snowflake_schema_service.py +2 -0
  19. signalpilot_ai_internal/unified_database_schema_service.py +2 -0
  20. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig → signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json +15 -48
  21. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json → signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig +9 -52
  22. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.bab318d6caadb055e29c.js +1 -0
  23. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/129.868ca665e6fc225c20a0.js +1 -0
  24. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/179.fd45a2e75d471d0aa3b9.js +7 -0
  25. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.81105a94aa873fc51a94.js +1 -0
  26. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.a002dd4630d3b6404a90.js +1 -0
  27. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.cc6f6ecacd703bcdb468.js +1 -0
  28. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.817a883549d55a0e0576.js +1 -0
  29. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.a4daecd44f1e9364e44a.js +1 -0
  30. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.667225aab294fb5ed161.js +1 -0
  31. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/447.8138af2522716e5a926f.js +1 -0
  32. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.925c73e32f3c07448da0.js +1 -0
  33. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/477.aaa4cc9e87801fb45f5b.js +1 -0
  34. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.370056149a59022b700c.js +1 -0
  35. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/510.868ca665e6fc225c20a0.js +1 -0
  36. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.835f97f7ccfc70ff5c93.js +1 -0
  37. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.6c13335f73de089d6b1e.js +1 -0
  38. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/574.ad2709e91ebcac5bbe68.js +1 -0
  39. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.bddbab8e464fe31f0393.js +1 -0
  40. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.fda1bcdb10497b0a6ade.js +1 -0
  41. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.d046701f475fcbf6697d.js +1 -0
  42. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.c306dffd4cfe8a613d13.js +1 -0
  43. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.e39898b6f336539f228c.js +1 -0
  44. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.77cc0ca10a1860df1b52.js +1 -0
  45. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/936.4e2850b2af985ed0d378.js +1 -0
  46. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/956.eeffe67d7781fd63ef4b.js +2 -0
  47. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.055f50d20a31f3068c72.js +1 -0
  48. {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/third-party-licenses.json +29 -29
  49. {signalpilot_ai_internal-0.10.22.dist-info → signalpilot_ai_internal-0.11.24.dist-info}/METADATA +13 -31
  50. signalpilot_ai_internal-0.11.24.dist-info/RECORD +66 -0
  51. signalpilot_ai_internal-0.11.24.dist-info/licenses/LICENSE +7 -0
  52. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/110.224e83db03814fd03955.js +0 -7
  53. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.e2dadf63dc64d7b5f1ee.js +0 -1
  54. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.328403b5545f268b95c6.js +0 -1
  55. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.726e1da31a50868cb297.js +0 -1
  56. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.972abe1d2d66f083f9cc.js +0 -1
  57. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.dbec4c2dc12e7b050dcc.js +0 -1
  58. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.fa432bdb7fb6b1c95ad6.js +0 -1
  59. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.37e271d7a80336daabe2.js +0 -1
  60. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.ad22ccddd74ee306fb56.js +0 -1
  61. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.73c7a9290b7d35a8b9c1.js +0 -1
  62. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.b58fc0093d080b8ee61c.js +0 -1
  63. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js +0 -2
  64. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.c4232851631fb2e7e59a.js +0 -1
  65. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.9720593ee20b768da3ca.js +0 -1
  66. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.8e6edc9a965bdd578ca7.js +0 -1
  67. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/726.318e4e791edb63cc788f.js +0 -1
  68. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.dc49867fafb03ea2ba4d.js +0 -1
  69. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/742.91e7b516c8699eea3373.js +0 -1
  70. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.2d75de1a8d2c3131a8db.js +0 -1
  71. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.ca9e114a30896b669a3c.js +0 -1
  72. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.d9914229e4f120e7e9e4.js +0 -1
  73. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/888.34054db17bcf6e87ec95.js +0 -1
  74. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/936.d80de1e4da5b520d2f3b.js +0 -1
  75. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.b63c429ca81e743b403c.js +0 -1
  76. signalpilot_ai_internal-0.10.22.dist-info/RECORD +0 -56
  77. signalpilot_ai_internal-0.10.22.dist-info/licenses/LICENSE +0 -29
  78. {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/etc/jupyter/jupyter_server_config.d/signalpilot_ai.json +0 -0
  79. {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/install.json +0 -0
  80. {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/plugin.json +0 -0
  81. /signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js.LICENSE.txt → /signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/956.eeffe67d7781fd63ef4b.js.LICENSE.txt +0 -0
  82. {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/style.js +0 -0
  83. {signalpilot_ai_internal-0.10.22.dist-info → signalpilot_ai_internal-0.11.24.dist-info}/WHEEL +0 -0
@@ -0,0 +1,166 @@
+ """
+ Database Configuration Service
+ Manages database configurations stored in db.toml in the connect/ cache directory
+ """
+
+ import logging
+ from typing import Any, Dict, List, Optional
+
+ from .signalpilot_home import get_signalpilot_home
+
+ logger = logging.getLogger(__name__)
+
+
+ class DatabaseConfigService:
+     """
+     Service for managing database configurations in TOML format.
+     Configurations stored at <cache_dir>/connect/db.toml
+     (e.g., ~/Library/Caches/SignalPilotAI/connect/db.toml on macOS)
+     """
+
+     _instance = None
+
+     # Supported database types
+     SUPPORTED_TYPES = ["snowflake", "postgres", "mysql", "databricks"]
+
+     def __init__(self):
+         self._home_manager = get_signalpilot_home()
+
+     @classmethod
+     def get_instance(cls) -> 'DatabaseConfigService':
+         """Get singleton instance."""
+         if cls._instance is None:
+             cls._instance = DatabaseConfigService()
+         return cls._instance
+
+     def get_all_configs(self) -> List[Dict[str, Any]]:
+         """Get all database configurations."""
+         return self._home_manager.get_database_configs()
+
+     def get_config(self, db_type: str, name: str) -> Optional[Dict[str, Any]]:
+         """Get a specific database configuration."""
+         return self._home_manager.get_database_config(db_type, name)
+
+     def get_configs_by_type(self, db_type: str) -> List[Dict[str, Any]]:
+         """Get all configurations for a specific database type."""
+         configs = self.get_all_configs()
+         return [c for c in configs if c.get("type") == db_type]
+
+     def add_config(self, db_type: str, config: Dict[str, Any]) -> bool:
+         """Add a new database configuration."""
+         if db_type not in self.SUPPORTED_TYPES:
+             logger.error(f"Unsupported database type: {db_type}")
+             return False
+
+         if "name" not in config:
+             logger.error("Database config must have a 'name' field")
+             return False
+
+         return self._home_manager.add_database_config(db_type, config)
+
+     def update_config(self, db_type: str, name: str,
+                       updates: Dict[str, Any]) -> bool:
+         """Update an existing database configuration."""
+         return self._home_manager.update_database_config(db_type, name, updates)
+
+     def remove_config(self, db_type: str, name: str) -> bool:
+         """Remove a database configuration."""
+         return self._home_manager.remove_database_config(db_type, name)
+
+     def set_defaults(self, defaults: Dict[str, Any]) -> bool:
+         """Set global defaults for database configurations."""
+         return self._home_manager.set_database_defaults(defaults)
+
+     def get_defaults(self) -> Dict[str, Any]:
+         """Get global defaults."""
+         return self._home_manager.get_database_defaults()
+
+     # ==================== Type-specific helpers ====================
+
+     def add_snowflake_config(self, name: str, account: str,
+                              database: str = None,
+                              warehouse: str = None,
+                              role: str = None,
+                              username: str = None,
+                              password: str = None,
+                              **extra) -> bool:
+         """Add a Snowflake database configuration."""
+         config = {
+             "name": name,
+             "account": account,
+         }
+         if database:
+             config["database"] = database
+         if warehouse:
+             config["warehouse"] = warehouse
+         if role:
+             config["role"] = role
+         if username:
+             config["username"] = username
+         if password:
+             config["password"] = password
+         config.update(extra)
+
+         return self.add_config("snowflake", config)
+
+     def add_postgres_config(self, name: str, host: str, port: int,
+                             database: str, username: str, password: str,
+                             **extra) -> bool:
+         """Add a PostgreSQL database configuration."""
+         config = {
+             "name": name,
+             "host": host,
+             "port": port,
+             "database": database,
+             "username": username,
+             "password": password,
+         }
+         config.update(extra)
+
+         return self.add_config("postgres", config)
+
+     def add_mysql_config(self, name: str, host: str, port: int,
+                          database: str, username: str, password: str,
+                          **extra) -> bool:
+         """Add a MySQL database configuration."""
+         config = {
+             "name": name,
+             "host": host,
+             "port": port,
+             "database": database,
+             "username": username,
+             "password": password,
+         }
+         config.update(extra)
+
+         return self.add_config("mysql", config)
+
+     def add_databricks_config(self, name: str, host: str,
+                               http_path: str, catalog: str,
+                               auth_type: str = "pat",
+                               access_token: str = None,
+                               client_id: str = None,
+                               client_secret: str = None,
+                               **extra) -> bool:
+         """Add a Databricks database configuration."""
+         config = {
+             "name": name,
+             "host": host,
+             "http_path": http_path,
+             "catalog": catalog,
+             "auth_type": auth_type,
+         }
+         if access_token:
+             config["access_token"] = access_token
+         if client_id:
+             config["client_id"] = client_id
+         if client_secret:
+             config["client_secret"] = client_secret
+         config.update(extra)
+
+         return self.add_config("databricks", config)
+
+
+ def get_database_config_service() -> DatabaseConfigService:
+     """Get the singleton instance."""
+     return DatabaseConfigService.get_instance()
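
For orientation, a minimal usage sketch of the new DatabaseConfigService API added above. The class and method names come straight from the diff; the host, credentials, and extra keyword arguments are hypothetical placeholders, and persistence is delegated to the SignalPilotHome manager rather than shown here.

from signalpilot_ai_internal.database_config_service import get_database_config_service

svc = get_database_config_service()

# Register a Postgres connection; add_config() validates the db type and
# the required 'name' field, returning False (and logging) on failure.
svc.add_postgres_config(
    name="analytics",
    host="db.example.internal",   # hypothetical host
    port=5432,
    database="analytics",
    username="reporter",
    password="s3cret",            # hypothetical credentials
    sslmode="require",            # extra kwargs are merged into the config
)

# Read back a single config, or filter all configs by type.
cfg = svc.get_config("postgres", "analytics")
postgres_cfgs = svc.get_configs_by_type("postgres")

# Unsupported engines are rejected up front.
assert svc.add_config("oracle", {"name": "legacy"}) is False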
@@ -22,6 +22,7 @@ import threading
  from jupyter_server.base.handlers import APIHandler
  import tornado
 
+ from .log_utils import print
 
  # In-memory token cache for Service Principal OAuth tokens
  # Key: connection_id or hash of client credentials
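
log_utils.py (+31 lines) is new in this release but its body does not appear in this diff; the import above rebinds print so that existing print() calls flow through the logger. A plausible minimal sketch of such a shim, offered purely as an assumption about its contents:

# Hypothetical sketch of signalpilot_ai_internal/log_utils.py;
# the real 31-line module is not shown in this diff.
import logging

_logger = logging.getLogger("signalpilot_ai_internal")

def print(*args, sep=" ", **_ignored):
    """Drop-in replacement for builtins.print.

    Modules do `from .log_utils import print`, shadowing the builtin so
    print() output lands in the server log instead of stdout. Extra
    keyword arguments (end, file, flush) are accepted and ignored.
    """
    _logger.info(sep.join(str(a) for a in args))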
@@ -193,17 +194,18 @@ class DatabricksSchemaHandler(APIHandler):
          """Build Databricks connection parameters from configuration"""
          import re
 
-         # Extract host from connectionUrl
-         connection_url = config.get('connectionUrl', '')
+         # Extract host - check 'host' first, then fall back to 'connectionUrl' for backwards compatibility
+         connection_url = config.get('host') or config.get('connectionUrl', '')
          if not connection_url:
-             raise ValueError("connectionUrl (workspace URL) is required for Databricks")
+             raise ValueError("host (workspace URL) is required for Databricks")
 
-         # Extract host from URL
+         # Extract host from URL - support both with and without protocol prefix
          url_match = re.match(r'https?://([^/]+)', connection_url)
-         if not url_match:
-             raise ValueError(f"Invalid Databricks connectionUrl format: {connection_url}")
-
-         server_hostname = url_match.group(1)
+         if url_match:
+             server_hostname = url_match.group(1)
+         else:
+             # Assume it's just the hostname without protocol
+             server_hostname = connection_url.split('/')[0].strip()
 
          # Get HTTP path for SQL warehouse
          http_path = config.get('warehouseHttpPath') or config.get('httpPath')
@@ -666,15 +668,18 @@ class DatabricksQueryHandler(APIHandler):
          """Build Databricks connection parameters from configuration"""
          import re
 
-         connection_url = config.get('connectionUrl', '')
+         # Extract host - check 'host' first, then fall back to 'connectionUrl' for backwards compatibility
+         connection_url = config.get('host') or config.get('connectionUrl', '')
          if not connection_url:
-             raise ValueError("connectionUrl (workspace URL) is required for Databricks")
+             raise ValueError("host (workspace URL) is required for Databricks")
 
+         # Extract host from URL - support both with and without protocol prefix
          url_match = re.match(r'https?://([^/]+)', connection_url)
-         if not url_match:
-             raise ValueError(f"Invalid Databricks connectionUrl format: {connection_url}")
-
-         server_hostname = url_match.group(1)
+         if url_match:
+             server_hostname = url_match.group(1)
+         else:
+             # Assume it's just the hostname without protocol
+             server_hostname = connection_url.split('/')[0].strip()
 
          http_path = config.get('warehouseHttpPath') or config.get('httpPath')
          if not http_path:
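
Both DatabricksSchemaHandler and DatabricksQueryHandler now share the same host normalization: a full workspace URL or a bare hostname is accepted, and only a missing value raises. A standalone sketch of that logic for illustration (the helper name is mine; the handlers inline this code):

import re

def extract_server_hostname(connection_url: str) -> str:
    # Raise only when no host/connectionUrl was supplied at all.
    if not connection_url:
        raise ValueError("host (workspace URL) is required for Databricks")
    # Full URL: take the authority part.
    match = re.match(r'https?://([^/]+)', connection_url)
    if match:
        return match.group(1)
    # Bare hostname, possibly with a trailing path segment.
    return connection_url.split('/')[0].strip()

assert extract_server_hostname("https://dbc-123.cloud.databricks.com/sql/1.0") == "dbc-123.cloud.databricks.com"
assert extract_server_hostname("dbc-123.cloud.databricks.com") == "dbc-123.cloud.databricks.com"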
@@ -19,6 +19,7 @@ import pyarrow.dataset as ds
  from openpyxl import load_workbook
 
  from .cache_service import get_cache_service, get_file_scan_cache_manager
+ from .log_utils import print
 
 
  class FileScannerService:
@@ -30,7 +31,7 @@ class FileScannerService:
          self._lock = threading.RLock()
 
          # Data file extensions
-         self.DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet', '.pkl', '.pickle',
+         self.DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet',
                                  '.feather', '.hdf5', '.h5', '.sql', '.db', '.sqlite', '.tsv', '.txt', '.ipynb'}
 
          # Directories to exclude from search
@@ -76,100 +77,6 @@ class FileScannerService:
              return (printable / len(chunk)) < 0.7
          except (IOError, OSError):
              return True
-
-     def _generate_pickle_data_preview(self, data: Any, max_items: int = 3, max_chars: int = 1000) -> Tuple[str, bool]:
-         """
-         Generate a content preview for non-DataFrame pickle data.
-         Returns (preview_content, is_truncated)
-         """
-         try:
-             data_type = type(data).__name__
-
-             if isinstance(data, (list, tuple)):
-                 if len(data) == 0:
-                     return f"Empty {data_type}", False
-
-                 preview_items = []
-                 for i, item in enumerate(data[:max_items]):
-                     item_str = str(item)
-                     if len(item_str) > 200:
-                         item_str = item_str[:200] + "..."
-                     preview_items.append(f"[{i}]: {item_str}")
-
-                 preview = f"{data_type} with {len(data)} items:\n" + "\n".join(preview_items)
-                 is_truncated = len(data) > max_items
-
-                 if len(preview) > max_chars:
-                     preview = preview[:max_chars] + "..."
-                     is_truncated = True
-
-                 return preview, is_truncated
-
-             elif isinstance(data, dict):
-                 if len(data) == 0:
-                     return f"Empty {data_type}", False
-
-                 preview_items = []
-                 for i, (key, value) in enumerate(list(data.items())[:max_items]):
-                     key_str = str(key)
-                     value_str = str(value)
-                     if len(value_str) > 150:
-                         value_str = value_str[:150] + "..."
-                     preview_items.append(f"'{key_str}': {value_str}")
-
-                 preview = f"{data_type} with {len(data)} keys:\n" + "\n".join(preview_items)
-                 is_truncated = len(data) > max_items
-
-                 if len(preview) > max_chars:
-                     preview = preview[:max_chars] + "..."
-                     is_truncated = True
-
-                 return preview, is_truncated
-
-             elif isinstance(data, np.ndarray):
-                 shape_str = str(data.shape)
-                 dtype_str = str(data.dtype)
-
-                 if data.size == 0:
-                     return f"Empty numpy array: shape={shape_str}, dtype={dtype_str}", False
-
-                 # Show first few elements
-                 flat_data = data.flatten()[:max_items]
-                 elements_str = ", ".join([str(x) for x in flat_data])
-
-                 preview = f"numpy.ndarray: shape={shape_str}, dtype={dtype_str}\nFirst elements: [{elements_str}]"
-                 is_truncated = data.size > max_items
-
-                 if len(preview) > max_chars:
-                     preview = preview[:max_chars] + "..."
-                     is_truncated = True
-
-                 return preview, is_truncated
-
-             elif isinstance(data, str):
-                 if len(data) == 0:
-                     return "Empty string", False
-
-                 preview = f"String ({len(data)} chars): {data[:max_chars]}"
-                 is_truncated = len(data) > max_chars
-                 return preview, is_truncated
-
-             elif isinstance(data, (int, float, bool)):
-                 return f"{data_type}: {data}", False
-
-             else:
-                 # For other types, try to convert to string
-                 data_str = str(data)
-                 if len(data_str) > max_chars:
-                     data_str = data_str[:max_chars] + "..."
-                     is_truncated = True
-                 else:
-                     is_truncated = False
-
-                 return f"{data_type}: {data_str}", is_truncated
-
-         except Exception as e:
-             return f"Error generating preview for {type(data).__name__}: {str(e)}", False
 
      def _parse_json_array_simple(self, filepath: str, max_items: int = 5) -> Tuple[List[Any], bool]:
          """
@@ -613,12 +520,11 @@ class FileScannerService:
              'is_tsv': extension == '.tsv',
              'is_json': extension == '.json',
              'is_parquet': extension == '.parquet',
-             'is_pkl': extension in ['.pkl', '.pickle'],
              'is_xlsx': extension == '.xlsx',
              'is_ipynb': extension == '.ipynb',
              'is_text': extension in ['.txt', '.md', '.py', '.js', '.ts', '.html', '.xml', '.ipynb'],
-             'is_data': extension in ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.pkl', '.pickle', '.xlsx'],
-             'is_binary': extension in ['.parquet', '.pkl', '.pickle', '.xlsx']  # Will be set later based on actual binary detection
+             'is_data': extension in ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.xlsx'],
+             'is_binary': extension in ['.parquet', '.xlsx']  # Will be set later based on actual binary detection
          }
 
          try:
@@ -1121,8 +1027,6 @@ class FileScannerService:
              file_type = 'tsv'
          elif extension == '.parquet':
              file_type = 'parquet'
-         elif extension in ['.pkl', '.pickle']:
-             file_type = 'pkl'
          elif extension == '.xlsx':
              file_type = 'xlsx'
          elif extension == '.json' or extension == '.jsonl':
@@ -1174,51 +1078,6 @@ class FileScannerService:
              except Exception:
                  sheet_names = ['Sheet1']  # Default sheet name
                  total_sheets = 1  # Default to 1 if we can't determine
-         elif file_type == 'pkl':
-             print(f"Reading pickle file: {abs_path}")
-             data = pd.read_pickle(abs_path)
-             print(f"Data: {data}")
-             if isinstance(data, pd.DataFrame):
-                 print(f"Data is a DataFrame: {data.head(5)}")
-                 df = data.head(5)  # Limit to first 5 rows
-             else:
-                 # Handle non-DataFrame pickle data
-                 print(f"Data is not a DataFrame: {type(data).__name__}")
-
-                 # Get file info
-                 file_info = self._get_file_type_info(str(item), extension)
-                 entry['file_info'] = file_info
-
-                 # Check if file is binary (pickle files are always binary)
-                 is_binary = True
-                 file_info['is_binary'] = True
-
-                 # Generate content preview for the pickle data
-                 content_preview, is_truncated = self._generate_pickle_data_preview(data)
-                 entry['content_preview'] = content_preview
-                 entry['is_truncated'] = is_truncated
-
-                 # Create schema for non-DataFrame pickle data
-                 schema = {
-                     'success': True,
-                     'fileId': abs_path,
-                     'fileName': item.name,
-                     'filePath': abs_path,
-                     'fileType': file_type,
-                     'extractedAt': datetime.now().isoformat(),
-                     'summary': f'Pickle file containing {type(data).__name__}',
-                     'columns': [],
-                     'totalRows': 1 if not hasattr(data, '__len__') else len(data) if hasattr(data, '__len__') else 1,
-                     'totalColumns': 0,
-                     'fileMtime': current_mtime
-                 }
-
-                 # Cache the entry
-                 if entry:
-                     entry['schema'] = schema
-                     self.file_scan_cache.set_file_entry(abs_path, entry)
-
-                 return schema
          elif file_type == 'json':
              # Read and analyze JSON file
              json_data, file_format, is_truncated = self._read_json_file(abs_path)
@@ -1273,7 +1132,7 @@ class FileScannerService:
 
          return schema
 
-     # Get file info for DataFrame pickle files and other file types
+     # Get file info for other file types
      file_info = self._get_file_type_info(str(item), extension)
      entry['file_info'] = file_info