stratifyai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +5 -0
- cli/stratifyai_cli.py +1753 -0
- stratifyai/__init__.py +113 -0
- stratifyai/api_key_helper.py +372 -0
- stratifyai/caching.py +279 -0
- stratifyai/chat/__init__.py +54 -0
- stratifyai/chat/builder.py +366 -0
- stratifyai/chat/stratifyai_anthropic.py +194 -0
- stratifyai/chat/stratifyai_bedrock.py +200 -0
- stratifyai/chat/stratifyai_deepseek.py +194 -0
- stratifyai/chat/stratifyai_google.py +194 -0
- stratifyai/chat/stratifyai_grok.py +194 -0
- stratifyai/chat/stratifyai_groq.py +195 -0
- stratifyai/chat/stratifyai_ollama.py +201 -0
- stratifyai/chat/stratifyai_openai.py +209 -0
- stratifyai/chat/stratifyai_openrouter.py +201 -0
- stratifyai/chunking.py +158 -0
- stratifyai/client.py +292 -0
- stratifyai/config.py +1273 -0
- stratifyai/cost_tracker.py +257 -0
- stratifyai/embeddings.py +245 -0
- stratifyai/exceptions.py +91 -0
- stratifyai/models.py +59 -0
- stratifyai/providers/__init__.py +5 -0
- stratifyai/providers/anthropic.py +330 -0
- stratifyai/providers/base.py +183 -0
- stratifyai/providers/bedrock.py +634 -0
- stratifyai/providers/deepseek.py +39 -0
- stratifyai/providers/google.py +39 -0
- stratifyai/providers/grok.py +39 -0
- stratifyai/providers/groq.py +39 -0
- stratifyai/providers/ollama.py +43 -0
- stratifyai/providers/openai.py +344 -0
- stratifyai/providers/openai_compatible.py +372 -0
- stratifyai/providers/openrouter.py +39 -0
- stratifyai/py.typed +2 -0
- stratifyai/rag.py +381 -0
- stratifyai/retry.py +185 -0
- stratifyai/router.py +643 -0
- stratifyai/summarization.py +179 -0
- stratifyai/utils/__init__.py +11 -0
- stratifyai/utils/bedrock_validator.py +136 -0
- stratifyai/utils/code_extractor.py +327 -0
- stratifyai/utils/csv_extractor.py +197 -0
- stratifyai/utils/file_analyzer.py +192 -0
- stratifyai/utils/json_extractor.py +219 -0
- stratifyai/utils/log_extractor.py +267 -0
- stratifyai/utils/model_selector.py +324 -0
- stratifyai/utils/provider_validator.py +442 -0
- stratifyai/utils/token_counter.py +186 -0
- stratifyai/vectordb.py +344 -0
- stratifyai-0.1.0.dist-info/METADATA +263 -0
- stratifyai-0.1.0.dist-info/RECORD +57 -0
- stratifyai-0.1.0.dist-info/WHEEL +5 -0
- stratifyai-0.1.0.dist-info/entry_points.txt +2 -0
- stratifyai-0.1.0.dist-info/licenses/LICENSE +21 -0
- stratifyai-0.1.0.dist-info/top_level.txt +2 -0

stratifyai/utils/csv_extractor.py
@@ -0,0 +1,197 @@
"""CSV/DataFrame schema extraction for intelligent file analysis.

This module extracts compact schema information from CSV files to reduce
token usage by 99%+ while preserving essential structure information.
"""

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Any
import pandas as pd


@dataclass
class ColumnSchema:
    """Schema information for a single column."""
    name: str
    dtype: str
    null_count: int
    null_percentage: float
    unique_count: int
    sample_values: List[Any]
    numeric_stats: Optional[Dict[str, float]] = None


@dataclass
class CSVSchema:
    """Complete schema information for a CSV file."""
    file_path: str
    row_count: int
    column_count: int
    columns: List[ColumnSchema]
    memory_usage_mb: float

    def to_text(self) -> str:
        """Convert schema to human-readable text representation.

        Returns:
            Formatted schema description
        """
        lines = [
            f"CSV File: {self.file_path}",
            f"Dimensions: {self.row_count:,} rows × {self.column_count} columns",
            f"Memory: {self.memory_usage_mb:.2f} MB",
            "",
            "Column Schema:"
        ]

        for col in self.columns:
            # Basic info
            lines.append(f"\n {col.name} ({col.dtype})")
            lines.append(f" - Null: {col.null_count:,} ({col.null_percentage:.1f}%)")
            lines.append(f" - Unique: {col.unique_count:,}")

            # Numeric stats if available
            if col.numeric_stats:
                stats = col.numeric_stats
                lines.append(f" - Range: {stats['min']:.2f} to {stats['max']:.2f}")
                lines.append(f" - Mean: {stats['mean']:.2f}, Median: {stats['median']:.2f}")
                lines.append(f" - Std: {stats['std']:.2f}")

            # Sample values
            samples_str = ", ".join(str(v) for v in col.sample_values[:5])
            lines.append(f" - Samples: {samples_str}")

        return "\n".join(lines)


def extract_csv_schema(
    file_path: Path,
    sample_size: int = 5,
    max_rows: Optional[int] = None
) -> CSVSchema:
    """Extract schema information from a CSV file.

    Args:
        file_path: Path to CSV file
        sample_size: Number of sample values to extract per column
        max_rows: Maximum number of rows to read (None = all)

    Returns:
        CSVSchema object with extracted information

    Raises:
        FileNotFoundError: If file doesn't exist
        pd.errors.EmptyDataError: If CSV is empty
        pd.errors.ParserError: If CSV is malformed
    """
    if not file_path.exists():
        raise FileNotFoundError(f"CSV file not found: {file_path}")

    # Read CSV
    df = pd.read_csv(file_path, nrows=max_rows)

    if df.empty:
        raise pd.errors.EmptyDataError(f"CSV file is empty: {file_path}")

    # Extract column schemas
    columns = []
    for col_name in df.columns:
        col_data = df[col_name]

        # Basic stats
        null_count = col_data.isna().sum()
        null_pct = (null_count / len(df)) * 100
        unique_count = col_data.nunique()

        # Sample values (exclude nulls)
        non_null_values = col_data.dropna()
        if len(non_null_values) > 0:
            sample_values = non_null_values.sample(
                min(sample_size, len(non_null_values)),
                random_state=42
            ).tolist()
        else:
            sample_values = []

        # Numeric statistics if applicable
        numeric_stats = None
        if pd.api.types.is_numeric_dtype(col_data):
            try:
                numeric_stats = {
                    'min': float(col_data.min()),
                    'max': float(col_data.max()),
                    'mean': float(col_data.mean()),
                    'median': float(col_data.median()),
                    'std': float(col_data.std())
                }
            except (ValueError, TypeError):
                # Handle edge cases (e.g., all NaN)
                pass

        columns.append(ColumnSchema(
            name=col_name,
            dtype=str(col_data.dtype),
            null_count=int(null_count),
            null_percentage=float(null_pct),
            unique_count=int(unique_count),
            sample_values=sample_values,
            numeric_stats=numeric_stats
        ))

    # Memory usage
    memory_bytes = df.memory_usage(deep=True).sum()
    memory_mb = memory_bytes / (1024 * 1024)

    return CSVSchema(
        file_path=str(file_path),
        row_count=len(df),
        column_count=len(df.columns),
        columns=columns,
        memory_usage_mb=memory_mb
    )


def estimate_token_reduction(original_size: int, schema_size: int) -> float:
    """Estimate token reduction percentage.

    Args:
        original_size: Size of original CSV in characters
        schema_size: Size of extracted schema in characters

    Returns:
        Reduction percentage (0-100)
    """
    if original_size == 0:
        return 0.0

    reduction = ((original_size - schema_size) / original_size) * 100
    return max(0.0, min(100.0, reduction))


def analyze_csv_file(file_path: Path) -> Dict[str, Any]:
    """Analyze a CSV file and return comprehensive information.

    Args:
        file_path: Path to CSV file

    Returns:
        Dictionary with schema and metadata
    """
    schema = extract_csv_schema(file_path)
    schema_text = schema.to_text()

    # Calculate original size
    original_size = file_path.stat().st_size
    schema_size = len(schema_text)

    reduction = estimate_token_reduction(original_size, schema_size)

    return {
        'schema': schema,
        'schema_text': schema_text,
        'original_size_bytes': original_size,
        'schema_size_bytes': schema_size,
        'token_reduction_pct': reduction,
        'recommended_action': 'Use schema for LLM analysis instead of full CSV'
    }
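
For context, a minimal usage sketch of the CSV extractor above. It is not part of the diff: the import path is assumed from the file layout in the listing (stratifyai/utils/csv_extractor.py), and "data.csv" is a hypothetical input file.

# Hedged usage sketch (assumptions: import path per the file layout; "data.csv" exists locally)
from pathlib import Path

from stratifyai.utils.csv_extractor import analyze_csv_file

result = analyze_csv_file(Path("data.csv"))
print(result["schema_text"])  # compact per-column schema text for the LLM
print(f"Estimated reduction: {result['token_reduction_pct']:.1f}%")
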
stratifyai/utils/file_analyzer.py
@@ -0,0 +1,192 @@
"""File analysis utilities for detecting file types and estimating token usage."""

from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from enum import Enum

from .token_counter import estimate_tokens, check_token_limit


class FileType(Enum):
    """Supported file types for intelligent processing."""
    CSV = "csv"
    JSON = "json"
    LOG = "log"
    PYTHON = "python"
    JAVASCRIPT = "javascript"
    JAVA = "java"
    GO = "go"
    TEXT = "text"
    MARKDOWN = "markdown"
    UNKNOWN = "unknown"


@dataclass
class FileAnalysis:
    """Results of file analysis."""
    file_path: Path
    file_type: FileType
    file_size_bytes: int
    file_size_mb: float
    content_length: int
    estimated_tokens: int
    exceeds_threshold: bool
    context_window: int
    percentage_used: float
    recommendation: str


def detect_file_type(file_path: Path) -> FileType:
    """
    Detect the type of file based on extension and content.

    Args:
        file_path: Path to the file

    Returns:
        FileType enum value
    """
    suffix = file_path.suffix.lower()

    # Map file extensions to types
    type_mapping = {
        ".csv": FileType.CSV,
        ".json": FileType.JSON,
        ".log": FileType.LOG,
        ".py": FileType.PYTHON,
        ".js": FileType.JAVASCRIPT,
        ".ts": FileType.JAVASCRIPT,
        ".jsx": FileType.JAVASCRIPT,
        ".tsx": FileType.JAVASCRIPT,
        ".java": FileType.JAVA,
        ".go": FileType.GO,
        ".txt": FileType.TEXT,
        ".md": FileType.MARKDOWN,
        ".markdown": FileType.MARKDOWN,
    }

    return type_mapping.get(suffix, FileType.UNKNOWN)


def get_recommendation(
    file_type: FileType,
    estimated_tokens: int,
    context_window: int,
    percentage_used: float
) -> str:
    """
    Get processing recommendation based on file analysis.

    Args:
        file_type: Detected file type
        estimated_tokens: Estimated token count
        context_window: Model's context window
        percentage_used: Percentage of context window used

    Returns:
        Recommendation string
    """
    # File fits comfortably in context
    if percentage_used < 0.5:
        return "✓ File fits well in model context - direct upload recommended"

    # File approaching context limit
    elif percentage_used < 0.8:
        return "⚠ File uses >50% of context - consider chunking for better performance"

    # File exceeds safe threshold
    else:
        if file_type == FileType.CSV:
            return "⚠ Large CSV detected - use schema extraction (--extract-mode schema) for 99% token reduction"
        elif file_type == FileType.JSON:
            return "⚠ Large JSON detected - use schema extraction (--extract-mode schema) for 95% token reduction"
        elif file_type == FileType.LOG:
            return "⚠ Large log file detected - use error extraction (--extract errors) for 90% token reduction"
        elif file_type in [FileType.PYTHON, FileType.JAVASCRIPT, FileType.JAVA, FileType.GO]:
            return "⚠ Large code file detected - use code extraction (--extract summary) for 80% token reduction"
        elif estimated_tokens > context_window:
            return "✗ File exceeds model context - chunking required (--chunked)"
        else:
            return "⚠ File near context limit - chunking recommended (--chunked) for 90% token reduction"


def analyze_file(
    file_path: Path,
    provider: str = "openai",
    model: str = "gpt-4o",
    threshold: float = 0.8
) -> FileAnalysis:
    """
    Analyze a file and provide recommendations for processing.

    Args:
        file_path: Path to the file to analyze
        provider: LLM provider for token estimation
        model: LLM model for context window limits
        threshold: Warning threshold (default 0.8 = 80%)

    Returns:
        FileAnalysis object with complete analysis

    Raises:
        FileNotFoundError: If file doesn't exist

    Examples:
        >>> analysis = analyze_file(Path("data.csv"), "openai", "gpt-4o")
        >>> print(f"Tokens: {analysis.estimated_tokens}")
        >>> print(analysis.recommendation)
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Get file info
    file_size_bytes = file_path.stat().st_size
    file_size_mb = file_size_bytes / (1024 * 1024)

    # Detect file type
    file_type = detect_file_type(file_path)

    # Read file content (with size limit for very large files)
    MAX_READ_SIZE = 10 * 1024 * 1024  # 10MB max for token estimation
    try:
        if file_size_bytes > MAX_READ_SIZE:
            # For very large files, estimate based on sample
            with open(file_path, 'r', encoding='utf-8') as f:
                sample = f.read(MAX_READ_SIZE)
                # Extrapolate token count
                sample_tokens = estimate_tokens(sample, provider, model)
                estimated_tokens = int(sample_tokens * (file_size_bytes / len(sample)))
                content_length = file_size_bytes  # Approximate
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                content_length = len(content)
                estimated_tokens = estimate_tokens(content, provider, model)
    except UnicodeDecodeError:
        # Binary file - rough estimate based on size
        content_length = file_size_bytes
        estimated_tokens = int(file_size_bytes / 4)  # Very rough estimate

    # Check token limits
    exceeds_threshold, context_window, percentage_used = check_token_limit(
        estimated_tokens, provider, model, threshold
    )

    # Get recommendation
    recommendation = get_recommendation(
        file_type, estimated_tokens, context_window, percentage_used
    )

    return FileAnalysis(
        file_path=file_path,
        file_type=file_type,
        file_size_bytes=file_size_bytes,
        file_size_mb=file_size_mb,
        content_length=content_length,
        estimated_tokens=estimated_tokens,
        exceeds_threshold=exceeds_threshold,
        context_window=context_window,
        percentage_used=percentage_used,
        recommendation=recommendation
    )
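
A minimal usage sketch for the analyzer above, again not part of the diff. The import path is assumed from the file layout (stratifyai/utils/file_analyzer.py); token estimation is delegated to the token_counter module in the listing, and "app.log" is a hypothetical file.

# Hedged usage sketch (assumptions: import path per the file layout; "app.log" exists locally)
from pathlib import Path

from stratifyai.utils.file_analyzer import analyze_file

analysis = analyze_file(Path("app.log"), provider="openai", model="gpt-4o")
print(f"Type: {analysis.file_type.value}, ~{analysis.estimated_tokens:,} tokens, "
      f"context window {analysis.context_window:,}")
print(analysis.recommendation)
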
stratifyai/utils/json_extractor.py
@@ -0,0 +1,219 @@
"""JSON schema extraction for intelligent file analysis.

This module extracts compact schema information from JSON files to reduce
token usage by 95%+ while preserving essential structure information.
"""

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Any, Union


@dataclass
class JSONSchema:
    """Schema information for a JSON structure."""
    type: str  # object, array, string, number, boolean, null
    keys: Optional[List[str]] = None  # For objects
    value_schema: Optional['JSONSchema'] = None  # For arrays
    nested_schemas: Optional[Dict[str, 'JSONSchema']] = None  # For objects
    sample_values: Optional[List[Any]] = None  # For primitives and arrays
    depth: int = 0

    def to_text(self, indent: int = 0) -> str:
        """Convert schema to human-readable text representation.

        Args:
            indent: Current indentation level

        Returns:
            Formatted schema description
        """
        prefix = " " * indent
        lines = []

        if self.type == "object":
            lines.append(f"{prefix}Object with {len(self.keys or [])} keys:")
            if self.nested_schemas:
                for key, schema in self.nested_schemas.items():
                    lines.append(f"{prefix} {key}:")
                    lines.append(schema.to_text(indent + 2))

        elif self.type == "array":
            lines.append(f"{prefix}Array:")
            if self.value_schema:
                lines.append(f"{prefix} Elements:")
                lines.append(self.value_schema.to_text(indent + 2))
            if self.sample_values:
                sample_str = ", ".join(str(v)[:50] for v in self.sample_values[:3])
                lines.append(f"{prefix} Sample: [{sample_str}]")

        else:
            # Primitive type
            lines.append(f"{prefix}{self.type}")
            if self.sample_values:
                sample_str = ", ".join(str(v)[:50] for v in self.sample_values[:5])
                lines.append(f"{prefix} Samples: {sample_str}")

        return "\n".join(lines)


def infer_json_schema(
    data: Any,
    max_depth: int = 10,
    current_depth: int = 0,
    sample_size: int = 3
) -> JSONSchema:
    """Infer schema from JSON data structure.

    Args:
        data: JSON data (dict, list, or primitive)
        max_depth: Maximum nesting depth to analyze
        current_depth: Current depth in recursion
        sample_size: Number of sample values to collect

    Returns:
        JSONSchema object describing the structure
    """
    if current_depth >= max_depth:
        return JSONSchema(type="...", depth=current_depth)

    if data is None:
        return JSONSchema(type="null", depth=current_depth)

    elif isinstance(data, bool):
        return JSONSchema(
            type="boolean",
            sample_values=[data],
            depth=current_depth
        )

    elif isinstance(data, (int, float)):
        return JSONSchema(
            type="number",
            sample_values=[data],
            depth=current_depth
        )

    elif isinstance(data, str):
        return JSONSchema(
            type="string",
            sample_values=[data[:100]],  # Truncate long strings
            depth=current_depth
        )

    elif isinstance(data, list):
        # Analyze array elements
        sample_values = data[:sample_size] if data else []

        # Infer schema from first element (assuming homogeneous array)
        value_schema = None
        if data:
            value_schema = infer_json_schema(
                data[0],
                max_depth,
                current_depth + 1,
                sample_size
            )

        return JSONSchema(
            type="array",
            value_schema=value_schema,
            sample_values=sample_values,
            depth=current_depth
        )

    elif isinstance(data, dict):
        # Analyze object keys and values
        keys = list(data.keys())
        nested_schemas = {}

        for key in keys:
            nested_schemas[key] = infer_json_schema(
                data[key],
                max_depth,
                current_depth + 1,
                sample_size
            )

        return JSONSchema(
            type="object",
            keys=keys,
            nested_schemas=nested_schemas,
            depth=current_depth
        )

    else:
        return JSONSchema(type="unknown", depth=current_depth)


def extract_json_schema(file_path: Path) -> Dict[str, Any]:
    """Extract schema information from a JSON file.

    Args:
        file_path: Path to JSON file

    Returns:
        Dictionary with schema and metadata

    Raises:
        FileNotFoundError: If file doesn't exist
        json.JSONDecodeError: If JSON is malformed
    """
    if not file_path.exists():
        raise FileNotFoundError(f"JSON file not found: {file_path}")

    # Read and parse JSON
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Infer schema
    schema = infer_json_schema(data)
    schema_text = schema.to_text()

    # Calculate sizes
    original_size = file_path.stat().st_size
    schema_size = len(schema_text)

    # Determine structure type
    if isinstance(data, dict):
        structure = f"Object with {len(data)} keys"
    elif isinstance(data, list):
        structure = f"Array with {len(data)} elements"
    else:
        structure = f"Primitive: {type(data).__name__}"

    return {
        'schema': schema,
        'schema_text': schema_text,
        'structure': structure,
        'original_size_bytes': original_size,
        'schema_size_bytes': schema_size,
        'token_reduction_pct': ((original_size - schema_size) / original_size * 100) if original_size > 0 else 0.0,
        'recommended_action': 'Use schema for LLM analysis instead of full JSON'
    }


def analyze_json_file(file_path: Path) -> str:
    """Analyze a JSON file and return schema description.

    Args:
        file_path: Path to JSON file

    Returns:
        Human-readable schema description
    """
    result = extract_json_schema(file_path)

    lines = [
        f"JSON File: {file_path}",
        f"Structure: {result['structure']}",
        f"Original size: {result['original_size_bytes']:,} bytes",
        f"Schema size: {result['schema_size_bytes']:,} bytes",
        f"Token reduction: {result['token_reduction_pct']:.1f}%",
        "",
        "Schema:",
        result['schema_text']
    ]

    return "\n".join(lines)
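
A minimal usage sketch for the JSON schema inference above, not part of the diff. It runs on an in-memory object so no file is needed; the import path is assumed from the file layout (stratifyai/utils/json_extractor.py) and the payload is made up for illustration.

# Hedged usage sketch (assumptions: import path per the file layout; payload is illustrative only)
from stratifyai.utils.json_extractor import infer_json_schema

payload = {"users": [{"id": 1, "name": "Ada", "active": True}], "count": 1}
schema = infer_json_schema(payload)
print(schema.to_text())  # prints a nested, indented description of the structure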