h2ogpte 1.6.42__py3-none-any.whl → 1.6.43rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- h2ogpte/__init__.py +1 -1
- h2ogpte/cli/__init__.py +0 -0
- h2ogpte/cli/commands/__init__.py +0 -0
- h2ogpte/cli/commands/command_handlers/__init__.py +0 -0
- h2ogpte/cli/commands/command_handlers/agent.py +41 -0
- h2ogpte/cli/commands/command_handlers/chat.py +37 -0
- h2ogpte/cli/commands/command_handlers/clear.py +8 -0
- h2ogpte/cli/commands/command_handlers/collection.py +67 -0
- h2ogpte/cli/commands/command_handlers/config.py +113 -0
- h2ogpte/cli/commands/command_handlers/disconnect.py +36 -0
- h2ogpte/cli/commands/command_handlers/exit.py +37 -0
- h2ogpte/cli/commands/command_handlers/help.py +8 -0
- h2ogpte/cli/commands/command_handlers/history.py +29 -0
- h2ogpte/cli/commands/command_handlers/rag.py +146 -0
- h2ogpte/cli/commands/command_handlers/research_agent.py +45 -0
- h2ogpte/cli/commands/command_handlers/session.py +77 -0
- h2ogpte/cli/commands/command_handlers/status.py +33 -0
- h2ogpte/cli/commands/dispatcher.py +79 -0
- h2ogpte/cli/core/__init__.py +0 -0
- h2ogpte/cli/core/app.py +105 -0
- h2ogpte/cli/core/config.py +199 -0
- h2ogpte/cli/core/encryption.py +104 -0
- h2ogpte/cli/core/session.py +171 -0
- h2ogpte/cli/integrations/__init__.py +0 -0
- h2ogpte/cli/integrations/agent.py +338 -0
- h2ogpte/cli/integrations/rag.py +442 -0
- h2ogpte/cli/main.py +90 -0
- h2ogpte/cli/ui/__init__.py +0 -0
- h2ogpte/cli/ui/hbot_prompt.py +435 -0
- h2ogpte/cli/ui/prompts.py +129 -0
- h2ogpte/cli/ui/status_bar.py +133 -0
- h2ogpte/cli/utils/__init__.py +0 -0
- h2ogpte/cli/utils/file_manager.py +411 -0
- h2ogpte/h2ogpte.py +471 -67
- h2ogpte/h2ogpte_async.py +482 -68
- h2ogpte/h2ogpte_sync_base.py +8 -1
- h2ogpte/rest_async/__init__.py +6 -3
- h2ogpte/rest_async/api/chat_api.py +29 -0
- h2ogpte/rest_async/api/collections_api.py +293 -0
- h2ogpte/rest_async/api/extractors_api.py +2874 -70
- h2ogpte/rest_async/api/prompt_templates_api.py +32 -32
- h2ogpte/rest_async/api_client.py +1 -1
- h2ogpte/rest_async/configuration.py +1 -1
- h2ogpte/rest_async/models/__init__.py +5 -2
- h2ogpte/rest_async/models/chat_completion.py +4 -2
- h2ogpte/rest_async/models/chat_completion_delta.py +5 -3
- h2ogpte/rest_async/models/chat_completion_request.py +1 -1
- h2ogpte/rest_async/models/chat_session.py +4 -2
- h2ogpte/rest_async/models/chat_settings.py +1 -1
- h2ogpte/rest_async/models/collection.py +4 -2
- h2ogpte/rest_async/models/collection_create_request.py +4 -2
- h2ogpte/rest_async/models/create_chat_session_request.py +87 -0
- h2ogpte/rest_async/models/extraction_request.py +1 -1
- h2ogpte/rest_async/models/extractor.py +4 -2
- h2ogpte/rest_async/models/guardrails_settings.py +8 -4
- h2ogpte/rest_async/models/guardrails_settings_create_request.py +1 -1
- h2ogpte/rest_async/models/process_document_job_request.py +1 -1
- h2ogpte/rest_async/models/question_request.py +1 -1
- h2ogpte/rest_async/models/{reset_and_share_prompt_template_request.py → reset_and_share_request.py} +6 -6
- h2ogpte/{rest_sync/models/reset_and_share_prompt_template_with_groups_request.py → rest_async/models/reset_and_share_with_groups_request.py} +6 -6
- h2ogpte/rest_async/models/summarize_request.py +1 -1
- h2ogpte/rest_async/models/update_collection_workspace_request.py +87 -0
- h2ogpte/rest_async/models/update_extractor_privacy_request.py +87 -0
- h2ogpte/rest_sync/__init__.py +6 -3
- h2ogpte/rest_sync/api/chat_api.py +29 -0
- h2ogpte/rest_sync/api/collections_api.py +293 -0
- h2ogpte/rest_sync/api/extractors_api.py +2874 -70
- h2ogpte/rest_sync/api/prompt_templates_api.py +32 -32
- h2ogpte/rest_sync/api_client.py +1 -1
- h2ogpte/rest_sync/configuration.py +1 -1
- h2ogpte/rest_sync/models/__init__.py +5 -2
- h2ogpte/rest_sync/models/chat_completion.py +4 -2
- h2ogpte/rest_sync/models/chat_completion_delta.py +5 -3
- h2ogpte/rest_sync/models/chat_completion_request.py +1 -1
- h2ogpte/rest_sync/models/chat_session.py +4 -2
- h2ogpte/rest_sync/models/chat_settings.py +1 -1
- h2ogpte/rest_sync/models/collection.py +4 -2
- h2ogpte/rest_sync/models/collection_create_request.py +4 -2
- h2ogpte/rest_sync/models/create_chat_session_request.py +87 -0
- h2ogpte/rest_sync/models/extraction_request.py +1 -1
- h2ogpte/rest_sync/models/extractor.py +4 -2
- h2ogpte/rest_sync/models/guardrails_settings.py +8 -4
- h2ogpte/rest_sync/models/guardrails_settings_create_request.py +1 -1
- h2ogpte/rest_sync/models/process_document_job_request.py +1 -1
- h2ogpte/rest_sync/models/question_request.py +1 -1
- h2ogpte/rest_sync/models/{reset_and_share_prompt_template_request.py → reset_and_share_request.py} +6 -6
- h2ogpte/{rest_async/models/reset_and_share_prompt_template_with_groups_request.py → rest_sync/models/reset_and_share_with_groups_request.py} +6 -6
- h2ogpte/rest_sync/models/summarize_request.py +1 -1
- h2ogpte/rest_sync/models/update_collection_workspace_request.py +87 -0
- h2ogpte/rest_sync/models/update_extractor_privacy_request.py +87 -0
- h2ogpte/session.py +3 -2
- h2ogpte/session_async.py +22 -6
- h2ogpte/types.py +6 -0
- {h2ogpte-1.6.42.dist-info → h2ogpte-1.6.43rc1.dist-info}/METADATA +5 -1
- {h2ogpte-1.6.42.dist-info → h2ogpte-1.6.43rc1.dist-info}/RECORD +98 -59
- h2ogpte-1.6.43rc1.dist-info/entry_points.txt +2 -0
- {h2ogpte-1.6.42.dist-info → h2ogpte-1.6.43rc1.dist-info}/WHEEL +0 -0
- {h2ogpte-1.6.42.dist-info → h2ogpte-1.6.43rc1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
from rich.table import Table
|
|
4
|
+
from rich.panel import Panel
|
|
5
|
+
from rich.text import Text
|
|
6
|
+
from rich.layout import Layout
|
|
7
|
+
from rich.live import Live
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class StatusBar:
    """Hold CLI connection state and render it as a one-line status bar.

    Four fields are tracked: connection flag, user name, collection and
    session. ``render_compact`` and ``render_fixed_bottom`` return plain
    strings; ``print_status_line`` writes a coloured separator and the
    status line to ``self.console``.
    """

    def __init__(self):
        # The same console is used to measure the terminal and to print, so
        # the render methods do not build a new Console on every call.
        self.console = Console()
        self.username: Optional[str] = None
        self.collection: Optional[str] = None
        self.session: Optional[str] = None
        self.is_connected: bool = False

    def update_connection_status(self, connected: bool, username: Optional[str] = None):
        """Record the connection flag and the user name that goes with it."""
        self.is_connected = connected
        self.username = username

    def update_collection(self, collection: Optional[str] = None):
        """Set the collection shown in the bar (``None`` clears it)."""
        self.collection = collection

    def update_session(self, session: Optional[str] = None):
        """Set the session shown in the bar (``None`` clears it)."""
        self.session = session

    def _create_status_section(
        self,
        label: str,
        value: Optional[str],
        icon: str,
        connected_style: str = "green",
        disconnected_style: str = "dim red",
    ) -> Text:
        """Build a styled ``icon label: value`` fragment.

        A falsy ``value`` is rendered as "Not set" in ``disconnected_style``
        with a dimmed icon; otherwise the icon is cyan and the value uses
        ``connected_style``.
        """
        status_text = Text()
        status_text.append(f"{icon} ", style="cyan" if value else "dim")
        status_text.append(f"{label}: ", style="dim")
        if value:
            status_text.append(f"{value}", style=connected_style)
        else:
            status_text.append("Not set", style=disconnected_style)
        return status_text

    @staticmethod
    def _field(icon: str, value: Optional[str]) -> str:
        """Return ``"<icon> <value>"``, or ``"<icon> —"`` when value is falsy."""
        return f"{icon} {value}" if value else f"{icon} —"

    def _status_parts(self, verbose: bool) -> list:
        """Return the four status fields in display order.

        ``verbose`` adds the words "Connected"/"Disconnected" after the
        coloured dot. The user name is only shown while connected.
        """
        if self.is_connected:
            head = "🟢 Connected" if verbose else "🟢"
        else:
            head = "🔴 Disconnected" if verbose else "🔴"

        visible_user = self.username if self.is_connected else None
        return [
            head,
            self._field("👤", visible_user),
            self._field("📚", self.collection),
            self._field("💬", self.session),
        ]

    def render_fixed_bottom(self) -> str:
        """Return the verbose status line fitted to the terminal width.

        The result is ``width - 2`` characters long (the caller adds one
        space on each side): shorter content is right-padded, longer content
        is cut and ends in "...". Length is counted in characters, not
        terminal cells.
        """
        width = self.console.size.width
        # Clamp so a very narrow terminal can never produce a negative slice
        # index (which would keep almost the whole string instead of cutting).
        available = max(width - 2, 0)

        status_content = " │ ".join(self._status_parts(verbose=True))
        if len(status_content) <= available:
            return status_content.ljust(available)
        if available >= 3:
            return status_content[: available - 3] + "..."
        # Not even room for the ellipsis: hard cut.
        return status_content[:available]

    def render_separator(self) -> str:
        """Return a horizontal rule spanning the full terminal width."""
        return "─" * self.console.size.width

    def print_status_line(self):
        """Print the separator and the status line, green when connected, red otherwise."""
        if self.is_connected:
            rule_style, line_style = "green", "white on green"
        else:
            rule_style, line_style = "dim red", "white on red"

        self.console.print(self.render_separator(), style=rule_style)
        self.console.print(f" {self.render_fixed_bottom()} ", style=line_style)

    def render_compact(self) -> str:
        """Return the status fields joined by " │ " without padding or truncation."""
        return " │ ".join(self._status_parts(verbose=False))

    def clear(self):
        """Reset every field to the disconnected/unset state."""
        self.username = None
        self.collection = None
        self.session = None
        self.is_connected = False
|
|
File without changes
|
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import mimetypes
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Dict, Any, Optional
|
|
5
|
+
|
|
6
|
+
import aiofiles
|
|
7
|
+
import git
|
|
8
|
+
import pathspec
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.filesize import decimal
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
from rich.tree import Tree
|
|
13
|
+
|
|
14
|
+
# Module-level Rich console; the classes below print all of their output through it.
console = Console()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FileManager:
    """Manages file operations and directory scanning."""

    # Byte values accepted as text by is_text_file: BEL, BS, TAB, LF, FF, CR,
    # ESC, plus everything from 0x20 upward except DEL. High bytes are
    # accepted so UTF-8 and other 8-bit encodings are not classed as binary.
    _TEXT_BYTES = bytes({7, 8, 9, 10, 12, 13, 27} | (set(range(0x20, 0x100)) - {0x7F}))

    def __init__(self):
        self.default_ignore_patterns = [
            "__pycache__",
            "*.pyc",
            ".git",
            ".svn",
            ".hg",
            "node_modules",
            "venv",
            ".env",
            "*.log",
            ".DS_Store",
            "Thumbs.db",
            ".idea",
            ".vscode",
            "*.swp",
            "*.swo",
            "*~",
            ".cache",
            "dist",
            "build",
            "*.egg-info",
        ]

    def _default_spec(self):
        """Return the compiled default-ignore spec.

        The spec is cached together with the pattern tuple it was built from
        and rebuilt only if ``default_ignore_patterns`` has changed, so
        scanning a large tree does not recompile it for every entry.
        """
        patterns = tuple(self.default_ignore_patterns)
        cached = getattr(self, "_default_spec_cache", None)
        if cached is None or cached[0] != patterns:
            cached = (patterns, pathspec.PathSpec.from_lines("gitwildmatch", patterns))
            self._default_spec_cache = cached
        return cached[1]

    def get_gitignore_spec(self, directory: Path) -> Optional[pathspec.PathSpec]:
        """Compile ``directory/.gitignore`` into a pathspec.

        Returns ``None`` when the directory has no ``.gitignore`` file. Only
        that single file is read; nested ``.gitignore`` files are not.
        """
        gitignore_path = directory / ".gitignore"
        if not gitignore_path.is_file():
            return None
        # Explicit encoding so the result does not depend on the platform
        # locale; undecodable bytes are replaced rather than aborting the scan.
        with open(gitignore_path, "r", encoding="utf-8", errors="replace") as f:
            return pathspec.PathSpec.from_lines("gitwildmatch", f)

    def should_ignore(
        self,
        path: Path,
        gitignore_spec: Optional[pathspec.PathSpec] = None,
        base_dir: Optional[Path] = None,
    ) -> bool:
        """Check if a path should be ignored.

        Args:
            path: File or directory to test.
            gitignore_spec: Optional compiled ``.gitignore`` patterns.
            base_dir: Directory the ``.gitignore`` belongs to. When given,
                ``path`` is matched relative to it so that anchored patterns
                work and directory names above ``base_dir`` cannot match.
                When omitted, the full ``str(path)`` is matched.

        Returns:
            True if the path's name matches a default ignore pattern or the
            path matches ``gitignore_spec``.
        """
        if self._default_spec().match_file(path.name):
            return True

        if not gitignore_spec:
            return False

        if base_dir is None:
            return gitignore_spec.match_file(str(path))

        try:
            relative = path.relative_to(base_dir).as_posix()
        except ValueError:
            # path is not under base_dir; fall back to the full path.
            return gitignore_spec.match_file(str(path))

        # Directory-only patterns ("build/") need the trailing slash to match
        # the directory entry itself, which lets the scan prune it early.
        if path.is_dir():
            relative += "/"
        return gitignore_spec.match_file(relative)

    def scan_directory(
        self,
        directory: Path,
        patterns: Optional[List[str]] = None,
        recursive: bool = True,
        include_hidden: bool = False,
        respect_gitignore: bool = True,
    ) -> List[Path]:
        """Scan directory for files matching patterns.

        Args:
            directory: Root of the scan.
            patterns: Optional glob patterns tested with ``Path.match``; a
                file is kept if any pattern matches. All files are kept when
                omitted or empty.
            recursive: Descend into sub-directories.
            include_hidden: Keep entries whose name starts with ".".
            respect_gitignore: Apply ``directory/.gitignore`` if present.

        Returns:
            Sorted list of matching file paths; empty if ``directory`` does
            not exist. Unreadable directories are reported and skipped.
        """
        if not directory.exists():
            console.print(f"[red]Directory not found: {directory}[/red]")
            return []

        gitignore_spec = (
            self.get_gitignore_spec(directory) if respect_gitignore else None
        )
        found: List[Path] = []

        def walk(current: Path):
            try:
                for entry in current.iterdir():
                    if not include_hidden and entry.name.startswith("."):
                        continue
                    if self.should_ignore(entry, gitignore_spec, directory):
                        continue

                    if entry.is_file():
                        if not patterns or any(entry.match(p) for p in patterns):
                            found.append(entry)
                    elif recursive and entry.is_dir():
                        walk(entry)
            except PermissionError:
                console.print(f"[yellow]Permission denied: {current}[/yellow]")

        walk(directory)
        return sorted(found)

    def get_file_info(self, file_path: Path) -> Dict[str, Any]:
        """Get detailed information about a file.

        Returns an empty dict if the path does not exist. Note that building
        the result reads the file twice (text sniffing and hashing).
        """
        if not file_path.exists():
            return {}

        stat = file_path.stat()
        mime_type, _ = mimetypes.guess_type(str(file_path))

        return {
            "path": str(file_path),
            "name": file_path.name,
            "size": stat.st_size,
            "size_human": decimal(stat.st_size),
            "mime_type": mime_type,
            "extension": file_path.suffix,
            "modified": stat.st_mtime,
            "created": stat.st_ctime,
            "is_text": self.is_text_file(file_path),
            "hash": self.get_file_hash(file_path),
        }

    def is_text_file(self, file_path: Path) -> bool:
        """Check if a file is a text file.

        The first 8192 bytes are inspected; the file counts as text when
        every byte is in ``_TEXT_BYTES`` (so newlines, tabs and non-ASCII
        bytes are allowed, NUL and most other control bytes are not). An
        empty file counts as text. Unreadable paths return False.
        """
        try:
            with open(file_path, "rb") as f:
                sample = f.read(8192)
        except (OSError, ValueError):
            return False
        # translate(None, delete) strips every allowed byte; anything left
        # over is a byte that should not appear in text.
        return not sample.translate(None, self._TEXT_BYTES)

    def get_file_hash(self, file_path: Path, algorithm: str = "md5") -> str:
        """Calculate file hash.

        Args:
            file_path: File to hash.
            algorithm: Any name accepted by ``hashlib.new``.

        Returns:
            Hex digest, or "" if the file cannot be read.
        """
        hash_func = hashlib.new(algorithm)
        try:
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(8192), b""):
                    hash_func.update(chunk)
        except (OSError, ValueError):
            return ""
        return hash_func.hexdigest()

    def display_file_tree(self, directory: Path, max_depth: int = 3):
        """Display directory structure as a tree.

        Directories are listed before files, hidden entries are skipped and
        recursion stops after ``max_depth`` levels.
        """
        tree = Tree(f"[bold cyan]{directory}[/bold cyan]")

        def add_directory(tree_node, path: Path, depth: int = 0):
            if depth >= max_depth:
                return

            try:
                items = sorted(path.iterdir(), key=lambda x: (not x.is_dir(), x.name))
                for item in items:
                    if item.name.startswith("."):
                        continue

                    if item.is_dir():
                        branch = tree_node.add(f"[blue]{item.name}/[/blue]")
                        add_directory(branch, item, depth + 1)
                        continue

                    try:
                        size = decimal(item.stat().st_size)
                    except PermissionError:
                        raise
                    except OSError:
                        # e.g. a dangling symlink: still list it, size unknown.
                        size = "?"
                    tree_node.add(f"{item.name} [dim]({size})[/dim]")
            except PermissionError:
                tree_node.add("[red]Permission Denied[/red]")

        add_directory(tree, directory)
        console.print(tree)

    def display_file_list(self, files: List[Path], show_details: bool = True):
        """Display list of files in a table.

        With ``show_details`` a table of at most 50 rows (name, size,
        extension, modification time) is printed, followed by a count of the
        remaining files; otherwise each path is printed as a bullet.
        """
        if not files:
            console.print("[yellow]No files found[/yellow]")
            return

        if not show_details:
            for file in files:
                console.print(f"  • {file}")
            return

        table = Table(title=f"Files ({len(files)} total)")
        table.add_column("Name", style="cyan")
        table.add_column("Size", style="white", justify="right")
        table.add_column("Type", style="dim")
        table.add_column("Modified", style="dim")

        for file in files[:50]:  # Limit display to 50 files
            info = self.get_file_info(file)
            table.add_row(
                file.name,
                info.get("size_human", ""),
                info.get("extension", ""),
                self._format_timestamp(info.get("modified", 0)),
            )

        console.print(table)

        if len(files) > 50:
            console.print(f"[dim]... and {len(files) - 50} more files[/dim]")

    def _format_timestamp(self, timestamp: float) -> str:
        """Format a POSIX timestamp as local ``YYYY-MM-DD HH:MM``."""
        from datetime import datetime

        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M")
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class FileUploader:
    """Handles file upload operations."""

    def __init__(self, file_manager: FileManager):
        self.file_manager = file_manager

    async def prepare_files(
        self, paths: List[Path], chunk_size: int = 1000, chunk_overlap: int = 200
    ) -> List[Dict[str, Any]]:
        """Prepare files for upload with chunking if needed.

        Args:
            paths: Files to prepare; missing paths are reported and skipped.
            chunk_size: Maximum chunk length in characters.
            chunk_overlap: Characters shared between consecutive chunks.

        Returns:
            One dict per chunk for text files (``content`` is the chunk text,
            ``metadata`` is the file info plus ``chunk_index`` and
            ``total_chunks``) and one dict with ``content=None`` for every
            other file.
        """
        prepared_files: List[Dict[str, Any]] = []

        for path in paths:
            if not path.exists():
                console.print(f"[red]File not found: {path}[/red]")
                continue

            info = self.file_manager.get_file_info(path)

            if not info.get("is_text"):
                # For binary files, just add metadata
                prepared_files.append({"path": path, "content": None, "metadata": info})
                continue

            chunks = await self.chunk_text_file(path, chunk_size, chunk_overlap)
            total = len(chunks)
            prepared_files.extend(
                {
                    "path": path,
                    "content": chunk,
                    "metadata": {**info, "chunk_index": index, "total_chunks": total},
                }
                for index, chunk in enumerate(chunks)
            )

        return prepared_files

    async def chunk_text_file(
        self, file_path: Path, chunk_size: int, chunk_overlap: int
    ) -> List[str]:
        """Split text file into chunks.

        The file is decoded as UTF-8 with undecodable bytes dropped, then
        split by character count.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is negative or not smaller than ``chunk_size``.
        """
        async with aiofiles.open(
            file_path, "r", encoding="utf-8", errors="ignore"
        ) as f:
            content = await f.read()

        return self._split_text(content, chunk_size, chunk_overlap)

    @staticmethod
    def _split_text(content: str, chunk_size: int, chunk_overlap: int) -> List[str]:
        """Split ``content`` into overlapping chunks of at most ``chunk_size`` characters.

        A chunk that is not the last one is shortened to end just after its
        final "." or newline when that boundary lies in the second half of
        the chunk and beyond the overlap. Each following chunk starts
        ``chunk_overlap`` characters before the previous one ended.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= chunk_overlap < chunk_size:
            # An overlap as large as the chunk would never advance.
            raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

        chunks: List[str] = []
        total = len(content)
        start = 0

        while start < total:
            end = start + chunk_size
            chunk = content[start:end]

            # Try to break at sentence boundary
            if end < total:
                break_point = max(chunk.rfind("."), chunk.rfind("\n"))
                # The second condition keeps the next start strictly ahead of
                # this one even when the overlap exceeds half the chunk size.
                if break_point > chunk_size // 2 and break_point >= chunk_overlap:
                    end = start + break_point + 1
                    chunk = content[start:end]

            chunks.append(chunk)

            # Once a chunk reaches the end of the text we are done; stepping
            # back by the overlap again would only emit a tail that is fully
            # contained in the chunk just added.
            if end >= total:
                break
            start = end - chunk_overlap

        return chunks
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class DirectoryAnalyzer:
    """Analyzes directory structure and content."""

    # Lower-cased file suffixes that decide a file's category.
    _CODE_EXTENSIONS = frozenset(
        {".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs", ".rb"}
    )
    _DOC_EXTENSIONS = frozenset({".md", ".txt", ".pdf", ".docx", ".rst"})
    _DATA_EXTENSIONS = frozenset({".json", ".csv", ".xml", ".yaml", ".yml"})
    _CONFIG_EXTENSIONS = frozenset({".toml", ".ini", ".env", ".config"})

    def __init__(self, file_manager: FileManager):
        self.file_manager = file_manager

    @classmethod
    def _categorize(cls, files: List[Path]) -> Dict[str, List[Path]]:
        """Group files into code/documents/data/config/other by lower-cased suffix."""
        categories: Dict[str, List[Path]] = {
            "code": [],
            "documents": [],
            "data": [],
            "config": [],
            "other": [],
        }
        for file in files:
            ext = file.suffix.lower()
            if ext in cls._CODE_EXTENSIONS:
                bucket = "code"
            elif ext in cls._DOC_EXTENSIONS:
                bucket = "documents"
            elif ext in cls._DATA_EXTENSIONS:
                bucket = "data"
            elif ext in cls._CONFIG_EXTENSIONS:
                bucket = "config"
            else:
                bucket = "other"
            categories[bucket].append(file)
        return categories

    @staticmethod
    def _total_size(files: List[Path]) -> int:
        """Sum the on-disk size of ``files``; unreadable or vanished files count as 0."""
        total = 0
        for file in files:
            try:
                total += file.stat().st_size
            except OSError:
                continue
        return total

    @staticmethod
    def _collect_git_info(directory: Path) -> Dict[str, Any]:
        """Return branch, commit count, remotes and working-tree counts.

        Best effort: any failure while inspecting the repository yields an
        empty dict so that the rest of the analysis is still reported.
        """
        try:
            repo = git.Repo(directory)
            try:
                branch = repo.active_branch.name
            except TypeError:
                # GitPython raises TypeError when HEAD is detached; keep the
                # remaining information instead of discarding all of it.
                branch = "(detached HEAD)"
            return {
                "branch": branch,
                # Count lazily instead of materialising every commit object.
                "commits": sum(1 for _ in repo.iter_commits()),
                "remotes": [remote.name for remote in repo.remotes],
                "modified_files": len(repo.index.diff(None)),
                "untracked_files": len(repo.untracked_files),
            }
        except Exception:
            return {}

    async def analyze(self, directory: Path) -> Dict[str, Any]:
        """Analyze directory structure and content.

        Scans ``directory`` through the file manager, totals file sizes,
        categorises files by extension, collects git details when a ``.git``
        entry exists, prints the result and returns it.

        Returns:
            Dict with keys ``directory``, ``total_files``, ``total_size``,
            ``total_size_human``, ``categories`` (name -> count),
            ``file_types`` (name -> list of paths), ``is_git_repo`` and
            ``git_info``.
        """
        console.print(f"[blue]Analyzing directory: {directory}[/blue]")

        files = self.file_manager.scan_directory(directory)

        # Only the size is needed here, so stat each file directly rather
        # than building the full file-info dict (which reads and hashes it).
        total_size = self._total_size(files)
        categories = self._categorize(files)

        # Check for git repository
        is_git_repo = (directory / ".git").exists()
        git_info = self._collect_git_info(directory) if is_git_repo else {}

        analysis = {
            "directory": str(directory),
            "total_files": len(files),
            "total_size": total_size,
            "total_size_human": decimal(total_size),
            "categories": {name: len(members) for name, members in categories.items()},
            "file_types": categories,
            "is_git_repo": is_git_repo,
            "git_info": git_info,
        }

        self.display_analysis(analysis)
        return analysis

    def display_analysis(self, analysis: Dict[str, Any]):
        """Display directory analysis results.

        Prints a summary table, a table of non-empty categories and, when
        git details are present, a git table.
        """
        # Basic stats
        stats_table = Table(title="Directory Analysis", show_header=False)
        stats_table.add_column("Property", style="cyan")
        stats_table.add_column("Value", style="white")

        stats_table.add_row("Directory", analysis["directory"])
        stats_table.add_row("Total Files", str(analysis["total_files"]))
        stats_table.add_row("Total Size", analysis["total_size_human"])

        console.print(stats_table)

        # File categories
        cat_table = Table(title="File Categories")
        cat_table.add_column("Category", style="cyan")
        cat_table.add_column("Count", style="white", justify="right")

        for category, count in analysis["categories"].items():
            if count > 0:
                cat_table.add_row(category.capitalize(), str(count))

        console.print(cat_table)

        # Git info if available
        if not (analysis["is_git_repo"] and analysis["git_info"]):
            return

        git_info = analysis["git_info"]
        git_table = Table(title="Git Repository Info", show_header=False)
        git_table.add_column("Property", style="cyan")
        git_table.add_column("Value", style="white")

        git_table.add_row("Branch", git_info.get("branch", "N/A"))
        git_table.add_row("Commits", str(git_info.get("commits", 0)))
        git_table.add_row("Modified Files", str(git_info.get("modified_files", 0)))
        git_table.add_row(
            "Untracked Files", str(git_info.get("untracked_files", 0))
        )

        console.print(git_table)
|