wcgw 2.4.3__py3-none-any.whl → 2.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wcgw might be problematic. Click here for more details.
- wcgw/client/anthropic_client.py +7 -17
- wcgw/client/common.py +3 -1
- wcgw/client/mcp_server/server.py +41 -36
- wcgw/client/openai_client.py +21 -36
- wcgw/client/openai_utils.py +5 -5
- wcgw/client/repo_ops/display_tree.py +127 -0
- wcgw/client/repo_ops/path_prob.py +58 -0
- wcgw/client/repo_ops/paths_model.vocab +20000 -0
- wcgw/client/repo_ops/paths_tokens.model +80042 -0
- wcgw/client/repo_ops/repo_context.py +148 -0
- wcgw/client/tools.py +220 -115
- wcgw/relay/serve.py +3 -3
- wcgw/types_.py +6 -4
- {wcgw-2.4.3.dist-info → wcgw-2.6.1.dist-info}/METADATA +19 -56
- {wcgw-2.4.3.dist-info → wcgw-2.6.1.dist-info}/RECORD +18 -12
- wcgw-2.6.1.dist-info/licenses/LICENSE +213 -0
- {wcgw-2.4.3.dist-info → wcgw-2.6.1.dist-info}/WHEEL +0 -0
- {wcgw-2.4.3.dist-info → wcgw-2.6.1.dist-info}/entry_points.txt +0 -0
wcgw/client/anthropic_client.py
CHANGED
|
@@ -29,7 +29,7 @@ from ..types_ import (
|
|
|
29
29
|
FileEdit,
|
|
30
30
|
Keyboard,
|
|
31
31
|
Mouse,
|
|
32
|
-
|
|
32
|
+
ReadFiles,
|
|
33
33
|
ReadImage,
|
|
34
34
|
ResetShell,
|
|
35
35
|
ScreenShot,
|
|
@@ -41,12 +41,7 @@ from .common import CostData
|
|
|
41
41
|
from .tools import ImageData
|
|
42
42
|
from .computer_use import Computer
|
|
43
43
|
|
|
44
|
-
from .tools import (
|
|
45
|
-
DoneFlag,
|
|
46
|
-
get_tool_output,
|
|
47
|
-
which_tool_name,
|
|
48
|
-
)
|
|
49
|
-
import tiktoken
|
|
44
|
+
from .tools import DoneFlag, get_tool_output, which_tool_name, default_enc
|
|
50
45
|
|
|
51
46
|
from urllib import parse
|
|
52
47
|
import subprocess
|
|
@@ -156,10 +151,6 @@ def loop(
|
|
|
156
151
|
|
|
157
152
|
limit = 1
|
|
158
153
|
|
|
159
|
-
enc = tiktoken.encoding_for_model(
|
|
160
|
-
"gpt-4o-2024-08-06",
|
|
161
|
-
)
|
|
162
|
-
|
|
163
154
|
tools = [
|
|
164
155
|
ToolParam(
|
|
165
156
|
input_schema=BashCommand.model_json_schema(),
|
|
@@ -192,12 +183,11 @@ def loop(
|
|
|
192
183
|
""",
|
|
193
184
|
),
|
|
194
185
|
ToolParam(
|
|
195
|
-
input_schema=
|
|
196
|
-
name="
|
|
186
|
+
input_schema=ReadFiles.model_json_schema(),
|
|
187
|
+
name="ReadFiles",
|
|
197
188
|
description="""
|
|
198
|
-
- Read full file content
|
|
199
|
-
- Provide absolute file
|
|
200
|
-
- Use this instead of 'cat' from BashCommand
|
|
189
|
+
- Read full file content of one or more files.
|
|
190
|
+
- Provide absolute file paths only
|
|
201
191
|
""",
|
|
202
192
|
),
|
|
203
193
|
ToolParam(
|
|
@@ -451,7 +441,7 @@ System information:
|
|
|
451
441
|
try:
|
|
452
442
|
output_or_dones, _ = get_tool_output(
|
|
453
443
|
tool_parsed,
|
|
454
|
-
|
|
444
|
+
default_enc,
|
|
455
445
|
limit - cost,
|
|
456
446
|
loop,
|
|
457
447
|
max_tokens=8000,
|
wcgw/client/common.py
CHANGED
|
@@ -38,7 +38,9 @@ def discard_input() -> None:
|
|
|
38
38
|
while True:
|
|
39
39
|
# Check if there is input to be read
|
|
40
40
|
if sys.stdin in select.select([sys.stdin], [], [], 0)[0]:
|
|
41
|
-
sys.stdin.read(
|
|
41
|
+
sys.stdin.read(
|
|
42
|
+
1
|
|
43
|
+
) # Read one character at a time to flush the input buffer
|
|
42
44
|
else:
|
|
43
45
|
break
|
|
44
46
|
finally:
|
wcgw/client/mcp_server/server.py
CHANGED
|
@@ -1,34 +1,33 @@
|
|
|
1
|
-
import asyncio
|
|
2
1
|
import importlib
|
|
3
2
|
import json
|
|
4
3
|
import os
|
|
5
|
-
import sys
|
|
6
|
-
import traceback
|
|
7
4
|
from typing import Any
|
|
8
5
|
|
|
9
|
-
from
|
|
6
|
+
from pydantic import AnyUrl, ValidationError
|
|
7
|
+
|
|
8
|
+
import mcp_wcgw.server.stdio
|
|
10
9
|
import mcp_wcgw.types as types
|
|
11
|
-
from mcp_wcgw.types import Tool as ToolParam
|
|
12
10
|
from mcp_wcgw.server import NotificationOptions, Server
|
|
13
|
-
from
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
from ..tools import DoneFlag, get_tool_output, which_tool_name, default_enc
|
|
11
|
+
from mcp_wcgw.server.models import InitializationOptions
|
|
12
|
+
from mcp_wcgw.types import Tool as ToolParam
|
|
13
|
+
|
|
17
14
|
from ...types_ import (
|
|
18
15
|
BashCommand,
|
|
19
16
|
BashInteraction,
|
|
20
|
-
WriteIfEmpty,
|
|
21
17
|
FileEdit,
|
|
18
|
+
GetScreenInfo,
|
|
19
|
+
Initialize,
|
|
22
20
|
Keyboard,
|
|
23
21
|
Mouse,
|
|
24
|
-
|
|
22
|
+
ReadFiles,
|
|
25
23
|
ReadImage,
|
|
26
24
|
ResetShell,
|
|
27
|
-
Initialize,
|
|
28
25
|
ScreenShot,
|
|
29
|
-
|
|
26
|
+
WriteIfEmpty,
|
|
30
27
|
)
|
|
28
|
+
from .. import tools
|
|
31
29
|
from ..computer_use import SLEEP_TIME_MAX_S
|
|
30
|
+
from ..tools import DoneFlag, default_enc, get_tool_output, which_tool_name
|
|
32
31
|
|
|
33
32
|
COMPUTER_USE_ON_DOCKER_ENABLED = False
|
|
34
33
|
|
|
@@ -76,7 +75,13 @@ async def handle_list_tools() -> list[types.Tool]:
|
|
|
76
75
|
inputSchema=Initialize.model_json_schema(),
|
|
77
76
|
name="Initialize",
|
|
78
77
|
description="""
|
|
79
|
-
- Always call this at the start of the conversation before
|
|
78
|
+
- Always call this at the start of the conversation before using any of the shell tools from wcgw.
|
|
79
|
+
- This will reset the shell.
|
|
80
|
+
- Use `any_workspace_path` to initialize the shell in the appropriate project directory.
|
|
81
|
+
- If the user has mentioned a workspace or project root, use it to set `any_workspace_path`.
|
|
82
|
+
- If the user has mentioned a folder or file with unclear project root, use the file or folder as `any_workspace_path`.
|
|
83
|
+
- If user has mentioned any files use `initial_files_to_read` to read, use absolute paths only.
|
|
84
|
+
- If `any_workspace_path` is provided, a tree structure of the workspace will be shown.
|
|
80
85
|
""",
|
|
81
86
|
),
|
|
82
87
|
ToolParam(
|
|
@@ -92,6 +97,7 @@ async def handle_list_tools() -> list[types.Tool]:
|
|
|
92
97
|
- The control will return to you in {SLEEP_TIME_MAX_S} seconds regardless of the status. For heavy commands, keep checking status using BashInteraction till they are finished.
|
|
93
98
|
- Run long running commands in background using screen instead of "&".
|
|
94
99
|
- Use longer wait_for_seconds if the command is expected to run for a long time.
|
|
100
|
+
- Do not use 'cat' to read files, use ReadFiles tool instead.
|
|
95
101
|
""",
|
|
96
102
|
),
|
|
97
103
|
ToolParam(
|
|
@@ -110,12 +116,11 @@ async def handle_list_tools() -> list[types.Tool]:
|
|
|
110
116
|
""",
|
|
111
117
|
),
|
|
112
118
|
ToolParam(
|
|
113
|
-
inputSchema=
|
|
114
|
-
name="
|
|
119
|
+
inputSchema=ReadFiles.model_json_schema(),
|
|
120
|
+
name="ReadFiles",
|
|
115
121
|
description="""
|
|
116
|
-
- Read full file content
|
|
117
|
-
- Provide absolute file
|
|
118
|
-
- Use this instead of 'cat' from BashCommand
|
|
122
|
+
- Read full file content of one or more files.
|
|
123
|
+
- Provide absolute file paths only
|
|
119
124
|
""",
|
|
120
125
|
),
|
|
121
126
|
ToolParam(
|
|
@@ -236,24 +241,24 @@ async def handle_call_tool(
|
|
|
236
241
|
if isinstance(output_or_done, str):
|
|
237
242
|
if issubclass(tool_type, Initialize):
|
|
238
243
|
output_or_done += """
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
244
|
+
---
|
|
245
|
+
You're an expert software engineer with shell and code knowledge.
|
|
246
|
+
|
|
247
|
+
Instructions:
|
|
248
|
+
|
|
249
|
+
- You should use the provided bash execution, reading and writing file tools to complete objective.
|
|
250
|
+
- First understand about the project by getting the folder structure (ignoring .git, node_modules, venv, etc.)
|
|
251
|
+
- Always read relevant files before editing.
|
|
252
|
+
- Do not provide code snippets unless asked by the user, instead directly add/edit the code.
|
|
253
|
+
- Do not install new tools/packages before ensuring no such tools/package or an alternative already exists.
|
|
254
|
+
- Do not use artifacts if you have access to the repository and not asked by the user to provide artifacts/snippets. Directly create/update using shell tools.
|
|
255
|
+
- Do not use Ctrl-c or Ctrl-z or interrupt commands without asking the user, because often the program don't show any update but they still are running.
|
|
256
|
+
- Do not use echo to write multi-line files, always use FileEdit tool to update a code.
|
|
243
257
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
- Do not install new tools/packages before ensuring no such tools/package or an alternative already exists.
|
|
249
|
-
- Do not use artifacts if you have access to the repository and not asked by the user to provide artifacts/snippets. Directly create/update using shell tools.
|
|
250
|
-
- Do not use Ctrl-c or Ctrl-z or interrupt commands without asking the user, because often the program don't show any update but they still are running.
|
|
251
|
-
- Do not use echo to write multi-line files, always use FileEdit tool to update a code.
|
|
252
|
-
|
|
253
|
-
Additional instructions:
|
|
254
|
-
Always run `pwd` if you get any file or directory not found error to make sure you're not lost, or to get absolute cwd.
|
|
255
|
-
|
|
256
|
-
Always write production ready, syntactically correct code.
|
|
258
|
+
Additional instructions:
|
|
259
|
+
Always run `pwd` if you get any file or directory not found error to make sure you're not lost, or to get absolute cwd.
|
|
260
|
+
|
|
261
|
+
Always write production ready, syntactically correct code.
|
|
257
262
|
"""
|
|
258
263
|
|
|
259
264
|
content.append(types.TextContent(type="text", text=output_or_done))
|
wcgw/client/openai_client.py
CHANGED
|
@@ -1,57 +1,45 @@
|
|
|
1
1
|
import base64
|
|
2
2
|
import json
|
|
3
3
|
import mimetypes
|
|
4
|
-
|
|
5
|
-
import
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
6
7
|
import traceback
|
|
7
|
-
|
|
8
|
+
import uuid
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import DefaultDict, Optional, cast
|
|
11
|
+
|
|
8
12
|
import openai
|
|
13
|
+
import petname # type: ignore[import-untyped]
|
|
14
|
+
import rich
|
|
15
|
+
import tokenizers # type: ignore[import-untyped]
|
|
16
|
+
from dotenv import load_dotenv
|
|
9
17
|
from openai import OpenAI
|
|
10
18
|
from openai.types.chat import (
|
|
19
|
+
ChatCompletionContentPartParam,
|
|
11
20
|
ChatCompletionMessageParam,
|
|
12
|
-
ChatCompletionAssistantMessageParam,
|
|
13
21
|
ChatCompletionUserMessageParam,
|
|
14
|
-
ChatCompletionContentPartParam,
|
|
15
|
-
ChatCompletionMessage,
|
|
16
|
-
ParsedChatCompletionMessage,
|
|
17
22
|
)
|
|
18
|
-
import
|
|
19
|
-
import petname # type: ignore[import-untyped]
|
|
23
|
+
from pydantic import BaseModel
|
|
20
24
|
from typer import Typer
|
|
21
|
-
import uuid
|
|
22
25
|
|
|
23
26
|
from ..types_ import (
|
|
24
27
|
BashCommand,
|
|
25
28
|
BashInteraction,
|
|
26
|
-
WriteIfEmpty,
|
|
27
29
|
FileEdit,
|
|
30
|
+
ReadFiles,
|
|
28
31
|
ReadImage,
|
|
29
|
-
ReadFile,
|
|
30
32
|
ResetShell,
|
|
33
|
+
WriteIfEmpty,
|
|
31
34
|
)
|
|
32
|
-
|
|
33
|
-
from .common import Models, discard_input
|
|
34
|
-
from .common import CostData, History
|
|
35
|
+
from .common import CostData, History, Models, discard_input
|
|
35
36
|
from .openai_utils import get_input_cost, get_output_cost
|
|
36
|
-
from .tools import ImageData
|
|
37
|
-
|
|
38
37
|
from .tools import (
|
|
39
38
|
DoneFlag,
|
|
39
|
+
ImageData,
|
|
40
40
|
get_tool_output,
|
|
41
41
|
which_tool,
|
|
42
42
|
)
|
|
43
|
-
import tiktoken
|
|
44
|
-
|
|
45
|
-
from urllib import parse
|
|
46
|
-
import subprocess
|
|
47
|
-
import os
|
|
48
|
-
import tempfile
|
|
49
|
-
|
|
50
|
-
import toml
|
|
51
|
-
from pydantic import BaseModel
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
from dotenv import load_dotenv
|
|
55
43
|
|
|
56
44
|
|
|
57
45
|
class Config(BaseModel):
|
|
@@ -160,9 +148,7 @@ def loop(
|
|
|
160
148
|
config.cost_limit = limit
|
|
161
149
|
limit = config.cost_limit
|
|
162
150
|
|
|
163
|
-
enc =
|
|
164
|
-
config.model if not config.model.startswith("o1") else "gpt-4o"
|
|
165
|
-
)
|
|
151
|
+
enc = tokenizers.Tokenizer.from_pretrained("Xenova/gpt-4o")
|
|
166
152
|
|
|
167
153
|
tools = [
|
|
168
154
|
openai.pydantic_function_tool(
|
|
@@ -188,11 +174,10 @@ def loop(
|
|
|
188
174
|
- Only one of send_text, send_specials, send_ascii should be provided.""",
|
|
189
175
|
),
|
|
190
176
|
openai.pydantic_function_tool(
|
|
191
|
-
|
|
177
|
+
ReadFiles,
|
|
192
178
|
description="""
|
|
193
|
-
- Read full file content
|
|
194
|
-
- Provide absolute file
|
|
195
|
-
- Use this instead of 'cat' from BashCommand
|
|
179
|
+
- Read full file content of one or more files.
|
|
180
|
+
- Provide absolute file paths only
|
|
196
181
|
""",
|
|
197
182
|
),
|
|
198
183
|
openai.pydantic_function_tool(
|
wcgw/client/openai_utils.py
CHANGED
|
@@ -15,7 +15,7 @@ from openai.types.chat import (
|
|
|
15
15
|
ParsedChatCompletionMessage,
|
|
16
16
|
)
|
|
17
17
|
import rich
|
|
18
|
-
import
|
|
18
|
+
from tokenizers import Tokenizer # type: ignore[import-untyped]
|
|
19
19
|
from typer import Typer
|
|
20
20
|
import uuid
|
|
21
21
|
|
|
@@ -23,7 +23,7 @@ from .common import CostData, History
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
def get_input_cost(
|
|
26
|
-
cost_map: CostData, enc:
|
|
26
|
+
cost_map: CostData, enc: Tokenizer, history: History
|
|
27
27
|
) -> tuple[float, int]:
|
|
28
28
|
input_tokens = 0
|
|
29
29
|
for msg in history:
|
|
@@ -31,8 +31,8 @@ def get_input_cost(
|
|
|
31
31
|
refusal = msg.get("refusal")
|
|
32
32
|
if isinstance(content, list):
|
|
33
33
|
for part in content:
|
|
34
|
-
if
|
|
35
|
-
input_tokens += len(enc.encode(part[
|
|
34
|
+
if "text" in part:
|
|
35
|
+
input_tokens += len(enc.encode(part["text"]))
|
|
36
36
|
elif content is None:
|
|
37
37
|
if refusal is None:
|
|
38
38
|
raise ValueError("Expected content or refusal to be present")
|
|
@@ -47,7 +47,7 @@ def get_input_cost(
|
|
|
47
47
|
|
|
48
48
|
def get_output_cost(
|
|
49
49
|
cost_map: CostData,
|
|
50
|
-
enc:
|
|
50
|
+
enc: Tokenizer,
|
|
51
51
|
item: ChatCompletionMessage | ChatCompletionMessageParam,
|
|
52
52
|
) -> tuple[float, int]:
|
|
53
53
|
if isinstance(item, ChatCompletionMessage):
|
|
wcgw/client/repo_ops/display_tree.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List, Set
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DirectoryTree:
    """Render a directory as indented text, listing only expanded entries.

    Files registered through :meth:`expand`, and the directories leading to
    them, are printed by name. Everything else is summarised per directory
    as a single "... N directories and M files hidden" line.
    """

    def __init__(self, root: Path, max_files: int = 10):
        """
        Initialize the DirectoryTree with a root path and maximum number of files to display

        Args:
            root: The root directory path to start from
            max_files: Maximum number of files to display in unexpanded
                directories (stored on the instance; the methods of this
                class do not consult it)

        Raises:
            ValueError: If ``root`` does not exist or is not a directory
        """
        self.root = root
        self.max_files = max_files
        self.expanded_files: Set[Path] = set()
        self.expanded_dirs: Set[Path] = set()

        if not self.root.exists():
            raise ValueError(f"Root path {root} does not exist")

        if not self.root.is_dir():
            raise ValueError(f"Root path {root} is not a directory")

    def expand(self, rel_path: str) -> None:
        """
        Expand a specific file in the tree

        Args:
            rel_path: Relative path from root to the file to expand

        Raises:
            ValueError: If the path does not exist, is not a file, or lies
                outside the root directory once ``..`` segments are collapsed
        """
        # Function-scope import keeps this class self-contained.
        import os.path

        abs_path = self.root / rel_path

        if not abs_path.exists():
            raise ValueError(f"Path {rel_path} does not exist")

        if not abs_path.is_file():
            raise ValueError(f"Path {rel_path} is not a file")

        # A raw string-prefix test accepts "root/../elsewhere" and sibling
        # directories that merely share a prefix.  Collapse ".." lexically
        # (symlinks are deliberately not resolved, so entries keep the same
        # spelling that iterdir() produces) and compare path components.
        root_abs = Path(os.path.abspath(self.root))
        try:
            inside = Path(os.path.abspath(abs_path)).relative_to(root_abs)
        except ValueError:
            raise ValueError(
                f"Path {rel_path} is outside root directory"
            ) from None

        # Store the normalised spelling so it matches what display() sees.
        self.expanded_files.add(self.root / inside)

        # Every ancestor between the file and the root (inclusive) must be
        # expanded; ``inside.parents`` ends with "." which maps to the root.
        self.expanded_dirs.update(self.root / parent for parent in inside.parents)

    def _list_directory(self, dir_path: Path) -> List[Path]:
        """List contents of a directory, sorted with directories first"""
        return sorted(
            dir_path.iterdir(), key=lambda entry: (not entry.is_dir(), entry.name.lower())
        )

    def _count_hidden_items(
        self, dir_path: Path, shown_items: List[Path]
    ) -> tuple[int, int]:
        """Count hidden files and directories in a directory

        Args:
            dir_path: Directory whose entries are inspected
            shown_items: Entries of ``dir_path`` that were already printed

        Returns:
            ``(hidden_files, hidden_dirs)`` for the entries not in
            ``shown_items``
        """
        hidden_items = set(dir_path.iterdir()) - set(shown_items)

        hidden_files = sum(1 for entry in hidden_items if entry.is_file())
        hidden_dirs = sum(1 for entry in hidden_items if entry.is_dir())

        return hidden_files, hidden_dirs

    def display(self) -> str:
        """Display the directory tree with expanded state

        Returns:
            The rendered tree; the first line is the root path and each
            nesting level is indented by two further spaces
        """
        writer = io.StringIO()

        def _display_recursive(
            current_path: Path, indent: int = 0, depth: int = 0
        ) -> None:
            # Print current directory name (full path for the root only)
            if current_path == self.root:
                writer.write(f"{current_path}\n")
            else:
                writer.write(f"{' ' * indent}{current_path.name}\n")

            # Don't recurse beyond depth 1 unless path contains expanded files
            if depth > 0 and current_path not in self.expanded_dirs:
                return

            shown_items: List[Path] = []
            for item in self._list_directory(current_path):
                # Show items only if they are expanded files or parents of
                # expanded items
                if item not in self.expanded_files and item not in self.expanded_dirs:
                    continue

                shown_items.append(item)
                if item.is_dir():
                    _display_recursive(item, indent + 2, depth + 1)
                else:
                    writer.write(f"{' ' * (indent + 2)}{item.name}\n")

            # Show hidden items count if any items were hidden
            hidden_files, hidden_dirs = self._count_hidden_items(
                current_path, shown_items
            )
            if hidden_files > 0 or hidden_dirs > 0:
                hidden_msg = []
                if hidden_dirs > 0:
                    hidden_msg.append(
                        f"{hidden_dirs} director{'ies' if hidden_dirs != 1 else 'y'}"
                    )
                if hidden_files > 0:
                    hidden_msg.append(
                        f"{hidden_files} file{'s' if hidden_files != 1 else ''}"
                    )
                writer.write(
                    f"{' ' * (indent + 2)}... {' and '.join(hidden_msg)} hidden\n"
                )

        _display_recursive(self.root, depth=0)

        return writer.getvalue()
|
|
wcgw/client/repo_ops/path_prob.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple
|
|
2
|
+
|
|
3
|
+
import tokenizers # type: ignore[import-untyped]
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FastPathAnalyzer:
    """Score paths by summing per-token values read from a vocabulary file.

    The vocabulary maps each token to a float that the scoring methods treat
    as a log probability; tokenisation is delegated to a ``tokenizers``
    model loaded from disk.
    """

    def __init__(self, model_path: str, vocab_path: str) -> None:
        """Initialize with vocabulary.

        Args:
            model_path: Tokenizer file passed to ``Tokenizer.from_file``
            vocab_path: Text file with one ``<token> <value>`` pair per line
        """
        # Load vocabulary and probabilities
        self.vocab_probs: Dict[str, float] = self._load_vocab(vocab_path)

        self.encoder = tokenizers.Tokenizer.from_file(model_path)

    @staticmethod
    def _load_vocab(vocab_path: str) -> Dict[str, float]:
        """Parse the vocabulary file into a token -> value mapping.

        Lines that do not consist of exactly two whitespace-separated fields,
        or whose second field is not a float, are skipped.
        """
        vocab: Dict[str, float] = {}
        # Explicit UTF-8: tokens may be non-ASCII and the platform default
        # encoding is not guaranteed to decode them.
        with open(vocab_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) != 2:
                    continue
                token, prob = parts
                try:
                    vocab[token] = float(prob)
                except ValueError:
                    continue
        return vocab

    def tokenize_batch(self, texts: List[str]) -> List[List[str]]:
        """Tokenize multiple texts at once."""
        encodings = self.encoder.encode_batch(texts)
        return [encoding.tokens for encoding in encodings]

    def detokenize(self, tokens: List[str]) -> str:
        """Convert tokens back to text, handling special tokens.

        Raises:
            ValueError: If any token is not part of the tokenizer vocabulary
        """
        # Tokenizer.decode() takes integer ids, not token strings, so the
        # tokens are mapped back to ids first.
        ids = []
        unknown = []
        for token in tokens:
            token_id = self.encoder.token_to_id(token)
            if token_id is None:
                unknown.append(token)
            else:
                ids.append(token_id)
        if unknown:
            raise ValueError(f"Tokens not in tokenizer vocabulary: {unknown}")
        return self.encoder.decode(ids)  # type: ignore[no-any-return]

    def calculate_path_probabilities_batch(
        self, paths: List[str]
    ) -> List[Tuple[float, List[str], List[str]]]:
        """Calculate log probability for multiple paths at once.

        Returns:
            One ``(log_prob_sum, tokens, unknown_tokens)`` tuple per path, in
            input order.  Tokens missing from the vocabulary contribute
            nothing to the sum and are reported in ``unknown_tokens``.
        """
        results = []
        # Batch tokenize all paths, then score each token list
        for tokens in self.tokenize_batch(paths):
            log_prob_sum = 0.0
            unknown_tokens = []
            for token in tokens:
                if token in self.vocab_probs:
                    log_prob_sum += self.vocab_probs[token]
                else:
                    unknown_tokens.append(token)

            results.append((log_prob_sum, tokens, unknown_tokens))

        return results

    def calculate_path_probability(
        self, path: str
    ) -> Tuple[float, List[str], List[str]]:
        """Calculate log probability for a single path."""
        return self.calculate_path_probabilities_batch([path])[0]
|