signalpilot-ai-internal 0.10.22__py3-none-any.whl → 0.11.24__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- signalpilot_ai_internal/_version.py +1 -1
- signalpilot_ai_internal/cache_service.py +22 -21
- signalpilot_ai_internal/composio_handlers.py +224 -0
- signalpilot_ai_internal/composio_service.py +511 -0
- signalpilot_ai_internal/database_config_handlers.py +182 -0
- signalpilot_ai_internal/database_config_service.py +166 -0
- signalpilot_ai_internal/databricks_schema_service.py +19 -14
- signalpilot_ai_internal/file_scanner_service.py +5 -146
- signalpilot_ai_internal/handlers.py +317 -8
- signalpilot_ai_internal/integrations_config.py +256 -0
- signalpilot_ai_internal/log_utils.py +31 -0
- signalpilot_ai_internal/mcp_handlers.py +33 -9
- signalpilot_ai_internal/mcp_service.py +94 -142
- signalpilot_ai_internal/oauth_token_store.py +141 -0
- signalpilot_ai_internal/schema_search_config.yml +17 -11
- signalpilot_ai_internal/schema_search_service.py +30 -10
- signalpilot_ai_internal/signalpilot_home.py +961 -0
- signalpilot_ai_internal/snowflake_schema_service.py +2 -0
- signalpilot_ai_internal/unified_database_schema_service.py +2 -0
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig → signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json +15 -48
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json → signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig +9 -52
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.bab318d6caadb055e29c.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/129.868ca665e6fc225c20a0.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/179.fd45a2e75d471d0aa3b9.js +7 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.81105a94aa873fc51a94.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.a002dd4630d3b6404a90.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.cc6f6ecacd703bcdb468.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.817a883549d55a0e0576.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.a4daecd44f1e9364e44a.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.667225aab294fb5ed161.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/447.8138af2522716e5a926f.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.925c73e32f3c07448da0.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/477.aaa4cc9e87801fb45f5b.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.370056149a59022b700c.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/510.868ca665e6fc225c20a0.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.835f97f7ccfc70ff5c93.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.6c13335f73de089d6b1e.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/574.ad2709e91ebcac5bbe68.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.bddbab8e464fe31f0393.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.fda1bcdb10497b0a6ade.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.d046701f475fcbf6697d.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.c306dffd4cfe8a613d13.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.e39898b6f336539f228c.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.77cc0ca10a1860df1b52.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/936.4e2850b2af985ed0d378.js +1 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/956.eeffe67d7781fd63ef4b.js +2 -0
- signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.055f50d20a31f3068c72.js +1 -0
- {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/third-party-licenses.json +29 -29
- {signalpilot_ai_internal-0.10.22.dist-info → signalpilot_ai_internal-0.11.24.dist-info}/METADATA +13 -31
- signalpilot_ai_internal-0.11.24.dist-info/RECORD +66 -0
- signalpilot_ai_internal-0.11.24.dist-info/licenses/LICENSE +7 -0
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/110.224e83db03814fd03955.js +0 -7
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.e2dadf63dc64d7b5f1ee.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.328403b5545f268b95c6.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.726e1da31a50868cb297.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.972abe1d2d66f083f9cc.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.dbec4c2dc12e7b050dcc.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.fa432bdb7fb6b1c95ad6.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.37e271d7a80336daabe2.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.ad22ccddd74ee306fb56.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.73c7a9290b7d35a8b9c1.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.b58fc0093d080b8ee61c.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js +0 -2
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.c4232851631fb2e7e59a.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.9720593ee20b768da3ca.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.8e6edc9a965bdd578ca7.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/726.318e4e791edb63cc788f.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.dc49867fafb03ea2ba4d.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/742.91e7b516c8699eea3373.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.2d75de1a8d2c3131a8db.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.ca9e114a30896b669a3c.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.d9914229e4f120e7e9e4.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/888.34054db17bcf6e87ec95.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/936.d80de1e4da5b520d2f3b.js +0 -1
- signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.b63c429ca81e743b403c.js +0 -1
- signalpilot_ai_internal-0.10.22.dist-info/RECORD +0 -56
- signalpilot_ai_internal-0.10.22.dist-info/licenses/LICENSE +0 -29
- {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/etc/jupyter/jupyter_server_config.d/signalpilot_ai.json +0 -0
- {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/install.json +0 -0
- {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/plugin.json +0 -0
- /signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js.LICENSE.txt → /signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/956.eeffe67d7781fd63ef4b.js.LICENSE.txt +0 -0
- {signalpilot_ai_internal-0.10.22.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/style.js +0 -0
- {signalpilot_ai_internal-0.10.22.dist-info → signalpilot_ai_internal-0.11.24.dist-info}/WHEEL +0 -0
signalpilot_ai_internal/database_config_service.py

@@ -0,0 +1,166 @@
+"""
+Database Configuration Service
+Manages database configurations stored in db.toml in the connect/ cache directory
+"""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from .signalpilot_home import get_signalpilot_home
+
+logger = logging.getLogger(__name__)
+
+
+class DatabaseConfigService:
+    """
+    Service for managing database configurations in TOML format.
+    Configurations stored at <cache_dir>/connect/db.toml
+    (e.g., ~/Library/Caches/SignalPilotAI/connect/db.toml on macOS)
+    """
+
+    _instance = None
+
+    # Supported database types
+    SUPPORTED_TYPES = ["snowflake", "postgres", "mysql", "databricks"]
+
+    def __init__(self):
+        self._home_manager = get_signalpilot_home()
+
+    @classmethod
+    def get_instance(cls) -> 'DatabaseConfigService':
+        """Get singleton instance."""
+        if cls._instance is None:
+            cls._instance = DatabaseConfigService()
+        return cls._instance
+
+    def get_all_configs(self) -> List[Dict[str, Any]]:
+        """Get all database configurations."""
+        return self._home_manager.get_database_configs()
+
+    def get_config(self, db_type: str, name: str) -> Optional[Dict[str, Any]]:
+        """Get a specific database configuration."""
+        return self._home_manager.get_database_config(db_type, name)
+
+    def get_configs_by_type(self, db_type: str) -> List[Dict[str, Any]]:
+        """Get all configurations for a specific database type."""
+        configs = self.get_all_configs()
+        return [c for c in configs if c.get("type") == db_type]
+
+    def add_config(self, db_type: str, config: Dict[str, Any]) -> bool:
+        """Add a new database configuration."""
+        if db_type not in self.SUPPORTED_TYPES:
+            logger.error(f"Unsupported database type: {db_type}")
+            return False
+
+        if "name" not in config:
+            logger.error("Database config must have a 'name' field")
+            return False
+
+        return self._home_manager.add_database_config(db_type, config)
+
+    def update_config(self, db_type: str, name: str,
+                      updates: Dict[str, Any]) -> bool:
+        """Update an existing database configuration."""
+        return self._home_manager.update_database_config(db_type, name, updates)
+
+    def remove_config(self, db_type: str, name: str) -> bool:
+        """Remove a database configuration."""
+        return self._home_manager.remove_database_config(db_type, name)
+
+    def set_defaults(self, defaults: Dict[str, Any]) -> bool:
+        """Set global defaults for database configurations."""
+        return self._home_manager.set_database_defaults(defaults)
+
+    def get_defaults(self) -> Dict[str, Any]:
+        """Get global defaults."""
+        return self._home_manager.get_database_defaults()
+
+    # ==================== Type-specific helpers ====================
+
+    def add_snowflake_config(self, name: str, account: str,
+                             database: str = None,
+                             warehouse: str = None,
+                             role: str = None,
+                             username: str = None,
+                             password: str = None,
+                             **extra) -> bool:
+        """Add a Snowflake database configuration."""
+        config = {
+            "name": name,
+            "account": account,
+        }
+        if database:
+            config["database"] = database
+        if warehouse:
+            config["warehouse"] = warehouse
+        if role:
+            config["role"] = role
+        if username:
+            config["username"] = username
+        if password:
+            config["password"] = password
+        config.update(extra)
+
+        return self.add_config("snowflake", config)
+
+    def add_postgres_config(self, name: str, host: str, port: int,
+                            database: str, username: str, password: str,
+                            **extra) -> bool:
+        """Add a PostgreSQL database configuration."""
+        config = {
+            "name": name,
+            "host": host,
+            "port": port,
+            "database": database,
+            "username": username,
+            "password": password,
+        }
+        config.update(extra)
+
+        return self.add_config("postgres", config)
+
+    def add_mysql_config(self, name: str, host: str, port: int,
+                         database: str, username: str, password: str,
+                         **extra) -> bool:
+        """Add a MySQL database configuration."""
+        config = {
+            "name": name,
+            "host": host,
+            "port": port,
+            "database": database,
+            "username": username,
+            "password": password,
+        }
+        config.update(extra)
+
+        return self.add_config("mysql", config)
+
+    def add_databricks_config(self, name: str, host: str,
+                              http_path: str, catalog: str,
+                              auth_type: str = "pat",
+                              access_token: str = None,
+                              client_id: str = None,
+                              client_secret: str = None,
+                              **extra) -> bool:
+        """Add a Databricks database configuration."""
+        config = {
+            "name": name,
+            "host": host,
+            "http_path": http_path,
+            "catalog": catalog,
+            "auth_type": auth_type,
+        }
+        if access_token:
+            config["access_token"] = access_token
+        if client_id:
+            config["client_id"] = client_id
+        if client_secret:
+            config["client_secret"] = client_secret
+        config.update(extra)
+
+        return self.add_config("databricks", config)
+
+
+def get_database_config_service() -> DatabaseConfigService:
+    """Get the singleton instance."""
+    return DatabaseConfigService.get_instance()
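The new module is a thin singleton facade over the SignalPilotHome store. A minimal usage sketch, using only the APIs visible in the hunk above (the connection values and the extra `sslmode` keyword are illustrative; the actual persistence behavior lives in the also-new signalpilot_home.py, not shown here):

    from signalpilot_ai_internal.database_config_service import get_database_config_service

    svc = get_database_config_service()  # Module-level accessor; always returns the singleton

    # Register a PostgreSQL connection. add_config() rejects unknown types and
    # configs without a 'name' field, returning False instead of raising.
    svc.add_postgres_config(
        name="analytics",
        host="db.example.com",
        port=5432,
        database="warehouse",
        username="reader",
        password="s3cret",
        sslmode="require",  # Arbitrary extra keys flow through **extra into the stored config
    )

    # Retrieve configs, filtered by type
    for cfg in svc.get_configs_by_type("postgres"):
        print(cfg["name"], cfg["host"])

Note that `get_configs_by_type` filters on a `type` key, which implies the home manager stamps each stored config with its database type when persisting it.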
signalpilot_ai_internal/databricks_schema_service.py

@@ -22,6 +22,7 @@ import threading
 from jupyter_server.base.handlers import APIHandler
 import tornado
 
+from .log_utils import print
 
 # In-memory token cache for Service Principal OAuth tokens
 # Key: connection_id or hash of client credentials
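The `from .log_utils import print` import (also added to file_scanner_service.py below) shadows the builtin so that the module's existing print calls are routed through the new log_utils module (+31 lines, contents not shown in this diff). Purely as an assumption about what such a shim looks like, a hypothetical sketch:

    # Hypothetical sketch only; the real log_utils.py is not included in this diff view.
    import builtins
    import logging

    _logger = logging.getLogger("signalpilot_ai_internal")

    def print(*args, **kwargs):
        """Drop-in replacement for builtins.print that mirrors output to a logger."""
        message = kwargs.get("sep", " ").join(str(a) for a in args)
        _logger.info(message)
        builtins.print(*args, **kwargs)  # Preserve normal console output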
@@ -193,17 +194,18 @@ class DatabricksSchemaHandler(APIHandler):
         """Build Databricks connection parameters from configuration"""
         import re
 
-        # Extract host
-        connection_url = config.get('connectionUrl', '')
+        # Extract host - check 'host' first, then fall back to 'connectionUrl' for backwards compatibility
+        connection_url = config.get('host') or config.get('connectionUrl', '')
         if not connection_url:
-            raise ValueError("
+            raise ValueError("host (workspace URL) is required for Databricks")
 
-        # Extract host from URL
+        # Extract host from URL - support both with and without protocol prefix
         url_match = re.match(r'https?://([^/]+)', connection_url)
-        if
-
-
-
+        if url_match:
+            server_hostname = url_match.group(1)
+        else:
+            # Assume it's just the hostname without protocol
+            server_hostname = connection_url.split('/')[0].strip()
 
         # Get HTTP path for SQL warehouse
         http_path = config.get('warehouseHttpPath') or config.get('httpPath')
@@ -666,15 +668,18 @@ class DatabricksQueryHandler(APIHandler):
         """Build Databricks connection parameters from configuration"""
         import re
 
-
+        # Extract host - check 'host' first, then fall back to 'connectionUrl' for backwards compatibility
+        connection_url = config.get('host') or config.get('connectionUrl', '')
         if not connection_url:
-            raise ValueError("
+            raise ValueError("host (workspace URL) is required for Databricks")
 
+        # Extract host from URL - support both with and without protocol prefix
 
        url_match = re.match(r'https?://([^/]+)', connection_url)
-        if
-
-
-
+        if url_match:
+            server_hostname = url_match.group(1)
+        else:
+            # Assume it's just the hostname without protocol
+            server_hostname = connection_url.split('/')[0].strip()
 
         http_path = config.get('warehouseHttpPath') or config.get('httpPath')
         if not http_path:
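The host-normalization change is identical in both handlers, so it can be read in isolation. A standalone sketch of the added logic (the function name and example hostnames are illustrative):

    import re

    def extract_server_hostname(connection_url: str) -> str:
        """Return the bare hostname, accepting values with or without an https:// prefix."""
        if not connection_url:
            raise ValueError("host (workspace URL) is required for Databricks")
        url_match = re.match(r'https?://([^/]+)', connection_url)
        if url_match:
            return url_match.group(1)
        # No protocol prefix: treat everything before the first '/' as the hostname
        return connection_url.split('/')[0].strip()

    # Both input forms normalize to the same hostname:
    assert extract_server_hostname("https://adb-123.azuredatabricks.net/sql/1.0") == "adb-123.azuredatabricks.net"
    assert extract_server_hostname("adb-123.azuredatabricks.net") == "adb-123.azuredatabricks.net"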
signalpilot_ai_internal/file_scanner_service.py

@@ -19,6 +19,7 @@ import pyarrow.dataset as ds
 from openpyxl import load_workbook
 
 from .cache_service import get_cache_service, get_file_scan_cache_manager
+from .log_utils import print
 
 
 class FileScannerService:
@@ -30,7 +31,7 @@ class FileScannerService:
         self._lock = threading.RLock()
 
         # Data file extensions
-        self.DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet',
+        self.DATA_EXTENSIONS = {'.csv', '.json', '.xlsx', '.xls', '.parquet',
                                 '.feather', '.hdf5', '.h5', '.sql', '.db', '.sqlite', '.tsv', '.txt', '.ipynb'}
 
         # Directories to exclude from search
@@ -76,100 +77,6 @@ class FileScannerService:
             return (printable / len(chunk)) < 0.7
         except (IOError, OSError):
             return True
-
-    def _generate_pickle_data_preview(self, data: Any, max_items: int = 3, max_chars: int = 1000) -> Tuple[str, bool]:
-        """
-        Generate a content preview for non-DataFrame pickle data.
-        Returns (preview_content, is_truncated)
-        """
-        try:
-            data_type = type(data).__name__
-
-            if isinstance(data, (list, tuple)):
-                if len(data) == 0:
-                    return f"Empty {data_type}", False
-
-                preview_items = []
-                for i, item in enumerate(data[:max_items]):
-                    item_str = str(item)
-                    if len(item_str) > 200:
-                        item_str = item_str[:200] + "..."
-                    preview_items.append(f"[{i}]: {item_str}")
-
-                preview = f"{data_type} with {len(data)} items:\n" + "\n".join(preview_items)
-                is_truncated = len(data) > max_items
-
-                if len(preview) > max_chars:
-                    preview = preview[:max_chars] + "..."
-                    is_truncated = True
-
-                return preview, is_truncated
-
-            elif isinstance(data, dict):
-                if len(data) == 0:
-                    return f"Empty {data_type}", False
-
-                preview_items = []
-                for i, (key, value) in enumerate(list(data.items())[:max_items]):
-                    key_str = str(key)
-                    value_str = str(value)
-                    if len(value_str) > 150:
-                        value_str = value_str[:150] + "..."
-                    preview_items.append(f"'{key_str}': {value_str}")
-
-                preview = f"{data_type} with {len(data)} keys:\n" + "\n".join(preview_items)
-                is_truncated = len(data) > max_items
-
-                if len(preview) > max_chars:
-                    preview = preview[:max_chars] + "..."
-                    is_truncated = True
-
-                return preview, is_truncated
-
-            elif isinstance(data, np.ndarray):
-                shape_str = str(data.shape)
-                dtype_str = str(data.dtype)
-
-                if data.size == 0:
-                    return f"Empty numpy array: shape={shape_str}, dtype={dtype_str}", False
-
-                # Show first few elements
-                flat_data = data.flatten()[:max_items]
-                elements_str = ", ".join([str(x) for x in flat_data])
-
-                preview = f"numpy.ndarray: shape={shape_str}, dtype={dtype_str}\nFirst elements: [{elements_str}]"
-                is_truncated = data.size > max_items
-
-                if len(preview) > max_chars:
-                    preview = preview[:max_chars] + "..."
-                    is_truncated = True
-
-                return preview, is_truncated
-
-            elif isinstance(data, str):
-                if len(data) == 0:
-                    return "Empty string", False
-
-                preview = f"String ({len(data)} chars): {data[:max_chars]}"
-                is_truncated = len(data) > max_chars
-                return preview, is_truncated
-
-            elif isinstance(data, (int, float, bool)):
-                return f"{data_type}: {data}", False
-
-            else:
-                # For other types, try to convert to string
-                data_str = str(data)
-                if len(data_str) > max_chars:
-                    data_str = data_str[:max_chars] + "..."
-                    is_truncated = True
-                else:
-                    is_truncated = False
-
-                return f"{data_type}: {data_str}", is_truncated
-
-        except Exception as e:
-            return f"Error generating preview for {type(data).__name__}: {str(e)}", False
 
     def _parse_json_array_simple(self, filepath: str, max_items: int = 5) -> Tuple[List[Any], bool]:
         """
@@ -613,12 +520,11 @@ class FileScannerService:
             'is_tsv': extension == '.tsv',
             'is_json': extension == '.json',
             'is_parquet': extension == '.parquet',
-            'is_pkl': extension in ['.pkl', '.pickle'],
             'is_xlsx': extension == '.xlsx',
             'is_ipynb': extension == '.ipynb',
             'is_text': extension in ['.txt', '.md', '.py', '.js', '.ts', '.html', '.xml', '.ipynb'],
-            'is_data': extension in ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.
-            'is_binary': extension in ['.parquet', '.
+            'is_data': extension in ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.xlsx'],
+            'is_binary': extension in ['.parquet', '.xlsx'] # Will be set later based on actual binary detection
         }
 
         try:
@@ -1121,8 +1027,6 @@ class FileScannerService:
             file_type = 'tsv'
         elif extension == '.parquet':
             file_type = 'parquet'
-        elif extension in ['.pkl', '.pickle']:
-            file_type = 'pkl'
         elif extension == '.xlsx':
             file_type = 'xlsx'
         elif extension == '.json' or extension == '.jsonl':
@@ -1174,51 +1078,6 @@ class FileScannerService:
             except Exception:
                 sheet_names = ['Sheet1']  # Default sheet name
                 total_sheets = 1  # Default to 1 if we can't determine
-        elif file_type == 'pkl':
-            print(f"Reading pickle file: {abs_path}")
-            data = pd.read_pickle(abs_path)
-            print(f"Data: {data}")
-            if isinstance(data, pd.DataFrame):
-                print(f"Data is a DataFrame: {data.head(5)}")
-                df = data.head(5)  # Limit to first 5 rows
-            else:
-                # Handle non-DataFrame pickle data
-                print(f"Data is not a DataFrame: {type(data).__name__}")
-
-                # Get file info
-                file_info = self._get_file_type_info(str(item), extension)
-                entry['file_info'] = file_info
-
-                # Check if file is binary (pickle files are always binary)
-                is_binary = True
-                file_info['is_binary'] = True
-
-                # Generate content preview for the pickle data
-                content_preview, is_truncated = self._generate_pickle_data_preview(data)
-                entry['content_preview'] = content_preview
-                entry['is_truncated'] = is_truncated
-
-                # Create schema for non-DataFrame pickle data
-                schema = {
-                    'success': True,
-                    'fileId': abs_path,
-                    'fileName': item.name,
-                    'filePath': abs_path,
-                    'fileType': file_type,
-                    'extractedAt': datetime.now().isoformat(),
-                    'summary': f'Pickle file containing {type(data).__name__}',
-                    'columns': [],
-                    'totalRows': 1 if not hasattr(data, '__len__') else len(data) if hasattr(data, '__len__') else 1,
-                    'totalColumns': 0,
-                    'fileMtime': current_mtime
-                }
-
-                # Cache the entry
-                if entry:
-                    entry['schema'] = schema
-                    self.file_scan_cache.set_file_entry(abs_path, entry)
-
-                return schema
         elif file_type == 'json':
             # Read and analyze JSON file
             json_data, file_format, is_truncated = self._read_json_file(abs_path)
@@ -1273,7 +1132,7 @@ class FileScannerService:
 
             return schema
 
-        # Get file info for
+        # Get file info for other file types
         file_info = self._get_file_type_info(str(item), extension)
         entry['file_info'] = file_info
 