stratifyai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +5 -0
- cli/stratifyai_cli.py +1753 -0
- stratifyai/__init__.py +113 -0
- stratifyai/api_key_helper.py +372 -0
- stratifyai/caching.py +279 -0
- stratifyai/chat/__init__.py +54 -0
- stratifyai/chat/builder.py +366 -0
- stratifyai/chat/stratifyai_anthropic.py +194 -0
- stratifyai/chat/stratifyai_bedrock.py +200 -0
- stratifyai/chat/stratifyai_deepseek.py +194 -0
- stratifyai/chat/stratifyai_google.py +194 -0
- stratifyai/chat/stratifyai_grok.py +194 -0
- stratifyai/chat/stratifyai_groq.py +195 -0
- stratifyai/chat/stratifyai_ollama.py +201 -0
- stratifyai/chat/stratifyai_openai.py +209 -0
- stratifyai/chat/stratifyai_openrouter.py +201 -0
- stratifyai/chunking.py +158 -0
- stratifyai/client.py +292 -0
- stratifyai/config.py +1273 -0
- stratifyai/cost_tracker.py +257 -0
- stratifyai/embeddings.py +245 -0
- stratifyai/exceptions.py +91 -0
- stratifyai/models.py +59 -0
- stratifyai/providers/__init__.py +5 -0
- stratifyai/providers/anthropic.py +330 -0
- stratifyai/providers/base.py +183 -0
- stratifyai/providers/bedrock.py +634 -0
- stratifyai/providers/deepseek.py +39 -0
- stratifyai/providers/google.py +39 -0
- stratifyai/providers/grok.py +39 -0
- stratifyai/providers/groq.py +39 -0
- stratifyai/providers/ollama.py +43 -0
- stratifyai/providers/openai.py +344 -0
- stratifyai/providers/openai_compatible.py +372 -0
- stratifyai/providers/openrouter.py +39 -0
- stratifyai/py.typed +2 -0
- stratifyai/rag.py +381 -0
- stratifyai/retry.py +185 -0
- stratifyai/router.py +643 -0
- stratifyai/summarization.py +179 -0
- stratifyai/utils/__init__.py +11 -0
- stratifyai/utils/bedrock_validator.py +136 -0
- stratifyai/utils/code_extractor.py +327 -0
- stratifyai/utils/csv_extractor.py +197 -0
- stratifyai/utils/file_analyzer.py +192 -0
- stratifyai/utils/json_extractor.py +219 -0
- stratifyai/utils/log_extractor.py +267 -0
- stratifyai/utils/model_selector.py +324 -0
- stratifyai/utils/provider_validator.py +442 -0
- stratifyai/utils/token_counter.py +186 -0
- stratifyai/vectordb.py +344 -0
- stratifyai-0.1.0.dist-info/METADATA +263 -0
- stratifyai-0.1.0.dist-info/RECORD +57 -0
- stratifyai-0.1.0.dist-info/WHEEL +5 -0
- stratifyai-0.1.0.dist-info/entry_points.txt +2 -0
- stratifyai-0.1.0.dist-info/licenses/LICENSE +21 -0
- stratifyai-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Progressive summarization utilities for large files."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
|
|
5
|
+
|
|
6
|
+
from .client import LLMClient
|
|
7
|
+
from .models import ChatRequest, Message
|
|
8
|
+
from .chunking import chunk_content, get_chunk_metadata
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def summarize_chunk(
    chunk: str,
    client: LLMClient,
    model: str = "gpt-4o-mini",
    max_tokens: int = 1000,
    context: Optional[str] = None
) -> str:
    """
    Produce a summary of one content chunk via the configured LLM client.

    A cheap model is used by default to keep summarization costs low.

    Args:
        chunk: Text to summarize.
        client: Pre-configured LLMClient used to issue the completion.
        model: Summarization model (default: gpt-4o-mini for cost).
        max_tokens: Upper bound on summary length in tokens.
        context: Optional description of the surrounding document.

    Returns:
        The model-generated summary text.
    """
    # Pick the prompt variant: a plain request, or one that carries
    # document-level context for this section.
    if not context:
        prompt = f"""Summarize the following text concisely, preserving key information:

{chunk}"""
    else:
        prompt = f"""Summarize the following section from a larger document.

Context: {context}

Section to summarize:
{chunk}

Provide a concise summary that preserves key information."""

    # Single-turn chat completion; hand back the text of the reply.
    reply = client.chat_completion(
        ChatRequest(
            model=model,
            messages=[Message(role="user", content=prompt)],
            max_tokens=max_tokens
        )
    )
    return reply.content
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def summarize_chunks_progressive(
    chunks: List[str],
    client: LLMClient,
    model: str = "gpt-4o-mini",
    context: Optional[str] = None,
    show_progress: bool = True
) -> str:
    """
    Progressively summarize multiple chunks.

    Each chunk is summarized individually, then all summaries are combined.
    If the combined text is still very long, a final summary-of-summaries
    pass is prepended.

    Args:
        chunks: List of content chunks
        client: LLMClient instance
        model: Model to use for summarization
        context: Optional context about the overall document
        show_progress: Whether to show progress bar

    Returns:
        Combined summary of all chunks
    """
    if not chunks:
        return ""

    if len(chunks) == 1:
        return summarize_chunk(chunks[0], client, model, context=context)

    total = len(chunks)

    def _summarize_all(on_chunk_done=None) -> List[str]:
        # Shared per-chunk loop (the original duplicated this verbatim in the
        # progress-bar and no-progress branches). on_chunk_done, when given,
        # is invoked after each chunk so the caller can advance a progress bar.
        parts = []
        for i, chunk in enumerate(chunks, 1):
            summary = summarize_chunk(
                chunk,
                client,
                model,
                context=f"{context} (Part {i}/{total})" if context else f"Part {i}/{total}"
            )
            parts.append(f"**Part {i}/{total}:**\n{summary}")
            if on_chunk_done is not None:
                on_chunk_done()
        return parts

    if show_progress:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
        ) as progress:
            task = progress.add_task(
                f"[cyan]Summarizing {total} chunks...",
                total=total
            )
            summaries = _summarize_all(lambda: progress.update(task, advance=1))
    else:
        summaries = _summarize_all()

    # Combine summaries
    combined = "\n\n".join(summaries)

    # If combined summaries are still very long, summarize the summaries
    if len(combined) > 10000:  # Arbitrary threshold
        final_summary = summarize_chunk(
            combined,
            client,
            model,
            context="Combined summaries of document sections"
        )
        return f"**Overall Summary:**\n{final_summary}\n\n**Detailed Summaries:**\n{combined}"

    return combined
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def summarize_file(
    content: str,
    client: LLMClient,
    chunk_size: int = 50000,
    model: str = "gpt-4o-mini",
    context: Optional[str] = None,
    show_progress: bool = True
) -> dict:
    """
    Summarize a large file using progressive chunking.

    Args:
        content: Full file content
        client: LLMClient instance
        chunk_size: Size of chunks in characters
        model: Model to use for summarization
        context: Optional context about the document
        show_progress: Whether to show progress

    Returns:
        Dictionary with summary and metadata (summary text, original/summary
        lengths, reduction percentage, and chunking metadata)
    """
    # Chunk the content
    chunks = chunk_content(content, chunk_size=chunk_size)
    metadata = get_chunk_metadata(chunks)

    # Summarize chunks
    summary = summarize_chunks_progressive(
        chunks,
        client,
        model=model,
        context=context,
        show_progress=show_progress
    )

    # Guard against empty input: the original divided by len(content)
    # unconditionally, which raised ZeroDivisionError for content == "".
    if content:
        reduction_percentage = round((1 - len(summary) / len(content)) * 100, 1)
    else:
        reduction_percentage = 0.0

    return {
        "summary": summary,
        "original_length": len(content),
        "summary_length": len(summary),
        "reduction_percentage": reduction_percentage,
        "num_chunks": metadata["num_chunks"],
        "chunk_metadata": metadata
    }
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Utility modules for StratifyAI."""
|
|
2
|
+
|
|
3
|
+
from .token_counter import estimate_tokens, count_tokens_for_messages
|
|
4
|
+
from .file_analyzer import analyze_file, FileAnalysis
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"estimate_tokens",
|
|
8
|
+
"count_tokens_for_messages",
|
|
9
|
+
"analyze_file",
|
|
10
|
+
"FileAnalysis",
|
|
11
|
+
]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Bedrock model validation utility.
|
|
2
|
+
|
|
3
|
+
Validates AWS Bedrock model availability using boto3.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import time
|
|
7
|
+
from typing import Dict, List, Any, Optional
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import boto3
|
|
11
|
+
from botocore.exceptions import ClientError, NoCredentialsError, BotoCoreError
|
|
12
|
+
BOTO3_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
BOTO3_AVAILABLE = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def validate_bedrock_models(
    model_ids: List[str],
    region_name: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Check which of the given Bedrock model IDs exist in the caller's
    AWS account/region.

    Args:
        model_ids: Model identifiers to check.
        region_name: AWS region (falls back to AWS_DEFAULT_REGION, then us-east-1).

    Returns:
        Dict with:
        - valid_models: model IDs confirmed (or assumed) available
        - invalid_models: model IDs not offered in the region
        - validation_time_ms: elapsed validation time in milliseconds
        - error: error description, or None on success
    """
    import os

    outcome: Dict[str, Any] = {
        "valid_models": [],
        "invalid_models": [],
        "validation_time_ms": 0,
        "error": None,
    }

    # Without boto3 we cannot query AWS at all; optimistically accept everything.
    if not BOTO3_AVAILABLE:
        outcome["error"] = "boto3 not installed"
        outcome["valid_models"] = model_ids  # Assume all valid if can't check
        return outcome

    started = time.time()

    try:
        region = region_name or os.getenv("AWS_DEFAULT_REGION", "us-east-1")

        # The "bedrock" control-plane service exposes list_foundation_models;
        # "bedrock-runtime" does not.
        bedrock = boto3.client(
            service_name="bedrock",
            region_name=region
        )

        listing = bedrock.list_foundation_models()
        offered = {entry["modelId"] for entry in listing.get("modelSummaries", [])}

        # Partition the requested IDs against what the region actually offers.
        outcome["valid_models"] = [m for m in model_ids if m in offered]
        outcome["invalid_models"] = [m for m in model_ids if m not in offered]

    except NoCredentialsError:
        outcome["error"] = "AWS credentials not configured"
        outcome["valid_models"] = model_ids  # Show all, let runtime handle auth

    except ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "Unknown")
        message = exc.response.get("Error", {}).get("Message", str(exc))
        outcome["error"] = f"AWS API error ({code}): {message}"
        outcome["valid_models"] = model_ids  # Show all on error

    except BotoCoreError as exc:
        outcome["error"] = f"AWS connection error: {str(exc)}"
        outcome["valid_models"] = model_ids  # Show all on error

    except Exception as exc:
        outcome["error"] = f"Validation failed: {str(exc)}"
        outcome["valid_models"] = model_ids  # Show all on error

    finally:
        outcome["validation_time_ms"] = int((time.time() - started) * 1000)

    return outcome
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def get_validated_interactive_models(
    region_name: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Validate the curated interactive Bedrock models and return them with
    their display metadata.

    Convenience wrapper around validate_bedrock_models for the curated
    interactive model set.

    Args:
        region_name: AWS region (defaults to AWS_DEFAULT_REGION or us-east-1)

    Returns:
        Dict containing:
        - models: model_id -> merged metadata (base config overlaid with
          interactive display fields)
        - validation_result: the full dict from validate_bedrock_models
    """
    from ..config import INTERACTIVE_BEDROCK_MODELS, BEDROCK_MODELS

    candidate_ids = list(INTERACTIVE_BEDROCK_MODELS.keys())
    validation = validate_bedrock_models(candidate_ids, region_name)

    # Interactive display metadata wins over the base model config on key clashes.
    models = {
        model_id: {
            **BEDROCK_MODELS.get(model_id, {}),
            **INTERACTIVE_BEDROCK_MODELS.get(model_id, {}),
        }
        for model_id in validation["valid_models"]
    }

    return {
        "models": models,
        "validation_result": validation,
    }
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""Code structure extraction for intelligent file analysis.
|
|
2
|
+
|
|
3
|
+
This module extracts structural information from code files using AST to reduce
|
|
4
|
+
token usage by 80%+ while preserving essential code structure.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import ast
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, List, Optional, Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class FunctionInfo:
    """Structural summary of one function or method, extracted via AST.

    Only signature-level facts are kept; the function body is not stored.
    """
    name: str  # function name as written in the source
    line_number: int  # 1-based line of the `def` statement
    params: List[str]  # parameter names, with ": <type>" appended when the annotation could be unparsed
    returns: Optional[str] = None  # unparsed return annotation, if any
    docstring: Optional[str] = None  # first line of the docstring, if any
    decorators: List[str] = field(default_factory=list)  # unparsed decorator expressions
    is_async: bool = False  # True for `async def`
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class ClassInfo:
    """Structural summary of one class, extracted via AST."""
    name: str  # class name as written in the source
    line_number: int  # 1-based line of the `class` statement
    bases: List[str]  # unparsed base-class expressions
    methods: List[FunctionInfo]  # methods collected while visiting the class body
    docstring: Optional[str] = None  # first line of the docstring, if any
    decorators: List[str] = field(default_factory=list)  # unparsed decorator expressions
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class CodeStructure:
    """Complete structural summary of one source file.

    Produced by extract_python_structure; rendered for LLM consumption
    via to_text().
    """
    file_path: str  # path of the analyzed file, as a string
    language: str  # e.g. "Python"
    imports: List[str]  # rendered import statements, in source order
    functions: List[FunctionInfo]  # module-level functions only
    classes: List[ClassInfo]  # classes with their collected methods
    total_lines: int  # line count of the original file
    docstring: Optional[str] = None  # first line of the module docstring, if any

    def to_text(self) -> str:
        """Convert structure to human-readable text.

        The output is intentionally truncated (first 20 imports, first 10
        methods per class, docstrings clipped to 100-200 chars) to keep
        token usage low when the text is fed to an LLM.

        Returns:
            Formatted code structure
        """
        # Header: file identity and size.
        lines = [
            f"Code File: {self.file_path}",
            f"Language: {self.language}",
            f"Total Lines: {self.total_lines:,}",
        ]

        if self.docstring:
            lines.append(f"\nModule Docstring:\n  {self.docstring[:200]}")

        # Imports (capped at 20 entries, with an elision note).
        if self.imports:
            lines.append(f"\nImports ({len(self.imports)}):")
            for imp in self.imports[:20]:  # Show first 20
                lines.append(f"  - {imp}")
            if len(self.imports) > 20:
                lines.append(f"  ... and {len(self.imports) - 20} more")

        # Functions: one reconstructed signature line each, plus clipped docstring.
        if self.functions:
            lines.append(f"\nFunctions ({len(self.functions)}):")
            for func in self.functions:
                decorators = f"@{', @'.join(func.decorators)} " if func.decorators else ""
                async_prefix = "async " if func.is_async else ""
                params = ", ".join(func.params)
                returns = f" -> {func.returns}" if func.returns else ""
                lines.append(f"  [Line {func.line_number}] {decorators}{async_prefix}def {func.name}({params}){returns}")
                if func.docstring:
                    lines.append(f"    \"{func.docstring[:100]}\"")

        # Classes: header line, clipped docstring, then up to 10 methods.
        if self.classes:
            lines.append(f"\nClasses ({len(self.classes)}):")
            for cls in self.classes:
                decorators = f"@{', @'.join(cls.decorators)} " if cls.decorators else ""
                bases = f"({', '.join(cls.bases)})" if cls.bases else ""
                lines.append(f"  [Line {cls.line_number}] {decorators}class {cls.name}{bases}:")
                if cls.docstring:
                    lines.append(f"    \"{cls.docstring[:100]}\"")
                if cls.methods:
                    lines.append(f"    Methods ({len(cls.methods)}):")
                    for method in cls.methods[:10]:  # Show first 10 methods
                        params = ", ".join(method.params)
                        lines.append(f"      - {method.name}({params})")
                    if len(cls.methods) > 10:
                        lines.append(f"      ... and {len(cls.methods) - 10} more")

        return "\n".join(lines)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class PythonASTVisitor(ast.NodeVisitor):
    """AST visitor to extract code structure from Python files.

    After a call to visit(), results accumulate on the instance attributes
    `imports`, `functions` (module-level only), and `classes`.
    """

    def __init__(self):
        self.imports: List[str] = []  # rendered import statements, in source order
        self.functions: List[FunctionInfo] = []  # module-level functions
        self.classes: List[ClassInfo] = []  # all classes encountered
        self.current_class: Optional[str] = None  # name of the class currently being visited

    @staticmethod
    def _safe_unparse(node, fallback: Optional[str] = None) -> Optional[str]:
        """Best-effort ast.unparse; return `fallback` when unparsing fails.

        Centralizes the try/except blocks the original repeated at four
        call sites, and replaces their bare `except:` clauses (which also
        swallowed KeyboardInterrupt/SystemExit) with `except Exception`.
        """
        try:
            return ast.unparse(node)
        except Exception:
            return fallback

    @staticmethod
    def _first_docstring_line(node) -> Optional[str]:
        """Return the first line of `node`'s docstring, or None if absent."""
        docstring = ast.get_docstring(node)
        return docstring.split('\n')[0] if docstring else None

    def visit_Import(self, node: ast.Import):
        """Visit import statement."""
        for alias in node.names:
            import_str = alias.name
            if alias.asname:
                import_str += f" as {alias.asname}"
            self.imports.append(f"import {import_str}")
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom):
        """Visit from...import statement."""
        # node.module is None for relative imports like `from . import x`.
        module = node.module or ""
        for alias in node.names:
            import_str = alias.name
            if alias.asname:
                import_str += f" as {alias.asname}"
            self.imports.append(f"from {module} import {import_str}")
        self.generic_visit(node)

    def visit_FunctionDef(self, node: ast.FunctionDef):
        """Visit function definition."""
        self._process_function(node, is_async=False)
        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        """Visit async function definition."""
        self._process_function(node, is_async=True)
        self.generic_visit(node)

    def _process_function(self, node, is_async: bool):
        """Record a function/method node as a FunctionInfo."""
        # Parameters, rendered as "name: type" where the annotation unparses.
        params = []
        for arg in node.args.args:
            param_name = arg.arg
            if arg.annotation:
                annotation = self._safe_unparse(arg.annotation)
                if annotation is not None:
                    param_name += f": {annotation}"
            params.append(param_name)

        returns = self._safe_unparse(node.returns) if node.returns else None

        func_info = FunctionInfo(
            name=node.name,
            line_number=node.lineno,
            params=params,
            returns=returns,
            docstring=self._first_docstring_line(node),
            decorators=[self._safe_unparse(d, "@decorator") for d in node.decorator_list],
            is_async=is_async
        )

        # Attach to the enclosing class when inside one, else record as top-level.
        if self.current_class:
            for cls in self.classes:
                if cls.name == self.current_class:
                    cls.methods.append(func_info)
                    break
        else:
            self.functions.append(func_info)

    def visit_ClassDef(self, node: ast.ClassDef):
        """Visit class definition."""
        class_info = ClassInfo(
            name=node.name,
            line_number=node.lineno,
            bases=[self._safe_unparse(b, "BaseClass") for b in node.bases],
            methods=[],
            docstring=self._first_docstring_line(node),
            decorators=[self._safe_unparse(d, "@decorator") for d in node.decorator_list]
        )

        self.classes.append(class_info)

        # Visit the body with current_class set so methods attach to this class.
        old_class = self.current_class
        self.current_class = node.name
        self.generic_visit(node)
        self.current_class = old_class
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def extract_python_structure(file_path: Path) -> CodeStructure:
    """Extract structure from Python file using AST.

    Args:
        file_path: Path to Python file

    Returns:
        CodeStructure object

    Raises:
        FileNotFoundError: If file doesn't exist
        SyntaxError: If Python code is malformed
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Python file not found: {file_path}")

    # Read the whole file up front; AST parsing needs the complete text.
    source_code = file_path.read_text(encoding='utf-8')

    # Parse AST
    try:
        tree = ast.parse(source_code, filename=str(file_path))
    except SyntaxError as e:
        # Chain the original exception so line/offset details are preserved
        # (the original `raise` without `from e` discarded the cause).
        raise SyntaxError(f"Failed to parse {file_path}: {e}") from e

    # Module docstring: keep the first line only, for a compact overview.
    docstring = ast.get_docstring(tree)
    if docstring:
        docstring = docstring.split('\n')[0]

    # Walk the tree collecting imports, functions, and classes.
    visitor = PythonASTVisitor()
    visitor.visit(tree)

    return CodeStructure(
        file_path=str(file_path),
        language="Python",
        imports=visitor.imports,
        functions=visitor.functions,
        classes=visitor.classes,
        total_lines=source_code.count('\n') + 1,
        docstring=docstring
    )
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def analyze_code_file(file_path: Path) -> Dict[str, Any]:
    """Analyze a code file and return structure information.

    Python files get a full AST-based structural summary; every other
    extension gets basic metadata plus a note that AST extraction is
    unavailable.

    Args:
        file_path: Path to code file

    Returns:
        Dictionary with structure and metadata
    """
    ext = file_path.suffix.lower()
    byte_size = file_path.stat().st_size

    if ext == '.py':
        # Full structural extraction, plus how much smaller the rendered
        # structure is than the raw file.
        structure = extract_python_structure(file_path)
        rendered = structure.to_text()
        rendered_size = len(rendered)
        reduction = ((byte_size - rendered_size) / byte_size * 100) if byte_size > 0 else 0.0
        return {
            'structure': structure,
            'structure_text': rendered,
            'original_size_bytes': byte_size,
            'structure_size_bytes': rendered_size,
            'token_reduction_pct': reduction,
            'recommended_action': 'Use structure for LLM analysis instead of full code'
        }

    # Non-Python file: report only size and line count.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
        line_count = len(fh.readlines())

    rendered = f"Code File: {file_path}\nLanguage: {ext[1:] if ext else 'unknown'}\nTotal Lines: {line_count}\n\nNote: AST extraction only available for Python files."

    return {
        'structure': None,
        'structure_text': rendered,
        'original_size_bytes': byte_size,
        'structure_size_bytes': len(rendered),
        'token_reduction_pct': 0.0,
        'recommended_action': 'Full file analysis required (non-Python)'
    }
|