ai-docs-gen 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_docs/__init__.py +0 -0
- ai_docs/__main__.py +22 -0
- ai_docs/assets/mermaid.min.js +1642 -0
- ai_docs/cache.py +52 -0
- ai_docs/changes.py +25 -0
- ai_docs/cli.py +84 -0
- ai_docs/domain.py +206 -0
- ai_docs/generator.py +959 -0
- ai_docs/llm.py +82 -0
- ai_docs/mkdocs.py +161 -0
- ai_docs/scanner.py +237 -0
- ai_docs/summary.py +238 -0
- ai_docs/tokenizer.py +26 -0
- ai_docs/utils.py +43 -0
- ai_docs_gen-0.1.2.dist-info/METADATA +197 -0
- ai_docs_gen-0.1.2.dist-info/RECORD +19 -0
- ai_docs_gen-0.1.2.dist-info/WHEEL +5 -0
- ai_docs_gen-0.1.2.dist-info/entry_points.txt +2 -0
- ai_docs_gen-0.1.2.dist-info/top_level.txt +1 -0
ai_docs/llm.py
ADDED
@@ -0,0 +1,82 @@

import json
import os
import threading
from typing import Dict, List, Optional

import requests

from .utils import sha256_text


class LLMClient:
    def __init__(
        self,
        api_key: str,
        base_url: str,
        model: str,
        temperature: float = 0.2,
        max_tokens: int = 1200,
        context_limit: int = 8192,
    ):
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.context_limit = context_limit
        self._cache_lock = threading.Lock()

    def _cache_key(self, payload: Dict) -> str:
        return sha256_text(json.dumps(payload, sort_keys=True))

    def chat(self, messages: List[Dict[str, str]], cache: Optional[Dict[str, str]] = None) -> str:
        payload = {
            "model": self.model,
            "messages": messages,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }
        key = self._cache_key(payload)
        if cache is not None:
            with self._cache_lock:
                if key in cache:
                    return cache[key]

        if self.base_url.endswith("/v1"):
            url = f"{self.base_url}/chat/completions"
        else:
            url = f"{self.base_url}/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        response = requests.post(url, headers=headers, json=payload, timeout=(120, 480))
        response.raise_for_status()
        data = response.json()
        try:
            content = data["choices"][0]["message"]["content"]
        except Exception as exc:
            raise RuntimeError(f"LLM response missing content: {data}") from exc
        if cache is not None:
            with self._cache_lock:
                cache[key] = content
        return content


def from_env() -> LLMClient:
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set")
    base_url = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
    model = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
    temperature = float(os.getenv("OPENAI_TEMPERATURE", "0.2"))
    max_tokens = int(os.getenv("OPENAI_MAX_TOKENS", "1200"))
    context_limit = int(os.getenv("OPENAI_CONTEXT_TOKENS", "8192"))
    return LLMClient(
        api_key=api_key,
        base_url=base_url,
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        context_limit=context_limit,
    )
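
A minimal usage sketch for this client, assuming OPENAI_API_KEY (and optionally OPENAI_BASE_URL / OPENAI_MODEL) are already exported; the message contents and the cache dict are illustrative and not part of the package:

# Hypothetical sketch, not shipped with the package.
from ai_docs.llm import from_env

client = from_env()
cache: dict[str, str] = {}  # reused across calls; keyed by sha256 of the JSON payload
answer = client.chat(
    [
        {"role": "system", "content": "You are a documentation assistant."},
        {"role": "user", "content": "Summarize this module."},
    ],
    cache=cache,
)
print(answer)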
ai_docs/mkdocs.py
ADDED
@@ -0,0 +1,161 @@

from pathlib import Path
from typing import Dict, List

import yaml


class _YamlPythonName(str):
    pass


class _YamlSafeDumper(yaml.SafeDumper):
    pass


def _python_name_representer(dumper: yaml.Dumper, data: _YamlPythonName) -> yaml.nodes.ScalarNode:
    return dumper.represent_scalar(f"tag:yaml.org,2002:python/name:{data}", "")


_YamlSafeDumper.add_representer(_YamlPythonName, _python_name_representer)


def build_mkdocs_yaml(
    site_name: str,
    sections: Dict[str, str],
    configs: Dict[str, str],
    local_site: bool = False,
    has_modules: bool = False,
    module_nav_paths: List[str] | None = None,
    project_config_nav_paths: List[str] | None = None,
) -> str:
    nav = [
        {"Главная": "index.md"},
    ]
    if "architecture" in sections:
        nav.append({"Архитектура": "architecture.md"})
    if "runtime" in sections:
        nav.append({"Запуск": "runtime.md"})
    if "dependencies" in sections:
        nav.append({"Зависимости": "dependencies.md"})
    if "testing" in sections:
        nav.append({"Тестирование": "testing.md"})
    if "conventions" in sections:
        nav.append({"Соглашения": "conventions.md"})
    if "glossary" in sections:
        nav.append({"Глоссарий": "glossary.md"})

    if configs:
        cfg_nav: List[Dict[str, str]] = []
        for key, filename in configs.items():
            title = {
                "kubernetes": "Kubernetes",
                "helm": "Helm",
                "terraform": "Terraform",
                "ansible": "Ansible",
                "docker": "Docker",
                "ci": "CI/CD",
            }.get(key, key)
            cfg_nav.append({title: f"configs/{filename}"})
        nav.append({"Конфиги": cfg_nav})

    if project_config_nav_paths:
        project_cfg_nav: List[Dict[str, object]] = [{"Обзор": "configs/index.md"}]
        project_cfg_nav.extend(_build_tree_nav(project_config_nav_paths, "configs/files/"))
        nav.append({"Конфигурация проекта": project_cfg_nav})

    if has_modules:
        modules_nav: List[Dict[str, object]] = [{"Обзор": "modules/index.md"}]
        if module_nav_paths:
            modules_nav.extend(_build_tree_nav(module_nav_paths, "modules/"))
        nav.append({"Модули": modules_nav})

    nav.append({"Изменения": "changes.md"})

    data = {
        "site_name": site_name,
        "docs_dir": ".ai-docs",
        "site_dir": "ai_docs_site",
        "plugins": [
            "search",
            {"mermaid2": {"javascript": "js/mermaid.min.js"}},
        ],
        "markdown_extensions": [
            "tables",
            "sane_lists",
            "attr_list",
            "def_list",
            "footnotes",
            "admonition",
            "fenced_code",
            {
                "pymdownx.superfences": {
                    "custom_fences": [
                        {
                            "name": "mermaid",
                            "class": "mermaid",
                            "format": _YamlPythonName("mermaid2.fence_mermaid"),
                        }
                    ]
                }
            },
        ],
        "nav": nav,
    }
    if local_site:
        data["site_url"] = ""
        data["use_directory_urls"] = False
    return yaml.dump(data, allow_unicode=True, sort_keys=False, Dumper=_YamlSafeDumper)


def _build_tree_nav(paths: List[str], strip_prefix: str) -> List[Dict[str, object]]:
    tree: Dict[str, object] = {}

    for rel_path in paths:
        rel = Path(rel_path).as_posix()
        if rel.startswith(strip_prefix):
            rel = rel[len(strip_prefix) :]
        parts = rel.split("/")
        if parts:
            last = Path(parts[-1]).with_suffix("").name
            sep = last.rfind("__")
            if sep != -1 and sep + 2 < len(last):
                base = last[:sep]
                ext = last[sep + 2 :]
                parts[-1] = f"{base}.{ext}"
            else:
                parts[-1] = last
        _insert_nav_node(tree, parts, rel_path)

    return _tree_to_nav(tree)


def _insert_nav_node(tree: Dict[str, object], parts: List[str], rel_path: str) -> None:
    key = parts[0]
    if len(parts) == 1:
        tree[key] = rel_path
        return
    node = tree.get(key)
    if not isinstance(node, dict):
        node = {}
        tree[key] = node
    _insert_nav_node(node, parts[1:], rel_path)


def _tree_to_nav(tree: Dict[str, object]) -> List[Dict[str, object]]:
    nav: List[Dict[str, object]] = []
    for key in sorted(tree.keys(), key=lambda k: (not isinstance(tree[k], dict), k.lower())):
        value = tree[key]
        if isinstance(value, dict):
            label = key if key.startswith("/") else f"/{key}"
            nav.append({label: _tree_to_nav(value)})
        else:
            nav.append({key: value})
    return nav


def write_docs_files(docs_dir: Path, files: Dict[str, str]) -> None:
    docs_dir.mkdir(parents=True, exist_ok=True)
    for rel_path, content in files.items():
        out_path = docs_dir / rel_path
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(content, encoding="utf-8")
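
A hypothetical call sketch for the helpers above; the site name, section keys, and config file names are placeholders (the Russian nav labels such as "Главная" ("Home") and "Изменения" ("Changes") come from the module itself):

# Hypothetical sketch, not shipped with the package.
from pathlib import Path

from ai_docs.mkdocs import build_mkdocs_yaml, write_docs_files

yaml_text = build_mkdocs_yaml(
    site_name="demo-service",
    sections={"architecture": "architecture.md", "runtime": "runtime.md"},
    configs={"docker": "docker.md", "ci": "ci.md"},
    local_site=True,  # emits use_directory_urls: false for file:// browsing
)
Path("mkdocs.yml").write_text(yaml_text, encoding="utf-8")

write_docs_files(
    Path(".ai-docs"),
    {"index.md": "# demo-service\n", "architecture.md": "# Архитектура\n"},
)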
ai_docs/scanner.py
ADDED
@@ -0,0 +1,237 @@

import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import pathspec
import yaml

from .domain import (
    CODE_EXTENSION_DESCRIPTIONS,
    CONFIG_EXTENSION_DESCRIPTIONS,
    DOC_EXTENSION_DESCRIPTIONS,
    classify_type,
    detect_domains,
    is_infra,
)
from .utils import is_binary_file, is_url, read_text_file, to_posix


FIXED_INCLUDE_PATTERNS = {
    "*.tf", "*.tfvars",
    "Dockerfile*", "docker-compose*.yml", "docker-compose*.yaml", "compose.yml", "compose.yaml",
    "Jenkinsfile", ".gitlab-ci.yml", "azure-pipelines.yml",
    "requirements.txt", "pyproject.toml", "package.json", "package-lock.json",
}

DEFAULT_EXCLUDE_PATTERNS = {
    ".git/*", "**/.git/*",
    ".venv/*", ".venv/**", "**/.venv/*", "**/.venv/**",
    "venv/*", "venv/**", "**/venv/*", "**/venv/**",
    "**/node_modules/*",
    "**/dist/*", "**/build/*",
    "**/.idea/*", "**/.vscode/*", "**/__pycache__/*",
    "**/.pytest_cache/*", "**/.mypy_cache/*",
    "**/.ai_docs_cache/*", "**/.ai_docs_cache/**", ".ai_docs_cache/**", ".ai_docs_cache/*",
    "**/ai_docs_site/*", "**/ai_docs_site/**", "ai_docs_site/**", "ai_docs_site/*",
    ".ai-docs/*", ".ai-docs/**", "**/.ai-docs/*", "**/.ai-docs/**",
    ".github/*", ".github/**", "**/.github/*", "**/.github/**",
    "mkdocs.yml", "**/mkdocs.yml", "mkdocs_yml.md", "**/mkdocs_yml.md",
    ".ai-docs.yaml", "**/.ai-docs.yaml",
}


class ScanResult:
    def __init__(self, root: Path, files: List[Dict], source: str, repo_name: str):
        self.root = root
        self.files = files
        self.source = source
        self.repo_name = repo_name


def _normalize_extensions(raw: object, defaults: Dict[str, str]) -> Dict[str, str]:
    normalized: Dict[str, str] = {}
    if isinstance(raw, dict):
        items = raw.items()
        for key, value in items:
            ext = str(key).strip()
            if not ext:
                continue
            if not ext.startswith("."):
                ext = f".{ext}"
            desc = value if isinstance(value, str) and value.strip() else defaults.get(ext, "")
            normalized[ext] = desc
    elif isinstance(raw, list):
        for item in raw:
            ext = str(item).strip()
            if not ext:
                continue
            if not ext.startswith("."):
                ext = f".{ext}"
            normalized[ext] = defaults.get(ext, "")
    return normalized or defaults.copy()


def _normalize_excludes(raw: object) -> Set[str]:
    if not isinstance(raw, list):
        return set()
    return {str(item).strip() for item in raw if str(item).strip()}


def _load_extension_config(root: Path) -> Dict[str, object]:
    config_path = root / ".ai-docs.yaml"
    defaults = {
        "code_extensions": CODE_EXTENSION_DESCRIPTIONS,
        "doc_extensions": DOC_EXTENSION_DESCRIPTIONS,
        "config_extensions": CONFIG_EXTENSION_DESCRIPTIONS,
    }

    if not config_path.exists():
        payload = {
            "code_extensions": defaults["code_extensions"],
            "doc_extensions": defaults["doc_extensions"],
            "config_extensions": defaults["config_extensions"],
        }
        config_path.write_text(
            yaml.safe_dump(payload, allow_unicode=True, sort_keys=False),
            encoding="utf-8",
        )
        return {**{key: value.copy() for key, value in defaults.items()}, "exclude": set()}

    try:
        raw = yaml.safe_load(config_path.read_text(encoding="utf-8", errors="ignore")) or {}
    except yaml.YAMLError:
        return {**{key: value.copy() for key, value in defaults.items()}, "exclude": set()}

    if not isinstance(raw, dict):
        return {**{key: value.copy() for key, value in defaults.items()}, "exclude": set()}

    code_raw = raw.get("code_extensions") or {}
    doc_raw = raw.get("doc_extensions") or {}
    config_raw = raw.get("config_extensions") or {}
    exclude_raw = raw.get("exclude") or []

    return {
        "code_extensions": _normalize_extensions(code_raw, defaults["code_extensions"]),
        "doc_extensions": _normalize_extensions(doc_raw, defaults["doc_extensions"]),
        "config_extensions": _normalize_extensions(config_raw, defaults["config_extensions"]),
        "exclude": _normalize_excludes(exclude_raw),
    }


def _build_default_include_patterns(extension_config: Dict[str, object]) -> Set[str]:
    extensions: Set[str] = set()
    for key in ("code_extensions", "doc_extensions", "config_extensions"):
        extensions.update(extension_config.get(key, {}).keys())
    return {f"*{ext}" for ext in extensions} | FIXED_INCLUDE_PATTERNS


def _load_ignore_specs(root: Path) -> List[pathspec.PathSpec]:
    specs: List[pathspec.PathSpec] = []
    for name in (".gitignore", ".build_ignore"):
        ignore_file = root / name
        if not ignore_file.exists():
            continue
        patterns = ignore_file.read_text(encoding="utf-8", errors="ignore").splitlines()
        specs.append(pathspec.PathSpec.from_lines("gitignore", patterns))
    return specs


def _should_include(rel_path: str, include: Optional[Set[str]], exclude: Optional[Set[str]], ignore_specs: List[pathspec.PathSpec]) -> bool:
    for spec in ignore_specs:
        if spec.match_file(rel_path):
            return False
    if exclude:
        for pattern in exclude:
            if pathspec.PathSpec.from_lines("gitignore", [pattern]).match_file(rel_path):
                return False
    if not include:
        return True
    for pattern in include:
        if pathspec.PathSpec.from_lines("gitignore", [pattern]).match_file(rel_path):
            return True
    return False


def _scan_directory(root: Path, include: Optional[Set[str]], exclude: Optional[Set[str]], max_size: int) -> List[Dict]:
    files: List[Dict] = []
    ignore_specs = _load_ignore_specs(root)

    for dirpath, dirnames, filenames in os.walk(root):
        # Avoid .git directory traversal
        dirnames[:] = [d for d in dirnames if d != ".git"]
        for filename in filenames:
            abs_path = Path(dirpath) / filename
            rel_path = abs_path.relative_to(root)
            rel_path_str = to_posix(rel_path)

            if not _should_include(rel_path_str, include, exclude, ignore_specs):
                continue

            if abs_path.is_symlink():
                continue

            try:
                size = abs_path.stat().st_size
            except OSError:
                continue

            if max_size and size > max_size:
                continue

            if is_binary_file(abs_path):
                continue

            content = read_text_file(abs_path)
            content_snippet = content[:4000]
            file_type = classify_type(abs_path)
            domains = detect_domains(abs_path, content_snippet)
            if is_infra(domains):
                file_type = "infra"

            files.append(
                {
                    "path": rel_path_str,
                    "abs_path": abs_path,
                    "size": size,
                    "content": content,
                    "type": file_type,
                    "domains": sorted(domains),
                }
            )

    return files


def _clone_repo(repo_url: str) -> Tuple[Path, str]:
    tmpdir = Path(tempfile.mkdtemp(prefix="ai_docs_"))
    try:
        subprocess.check_call(["git", "clone", "--depth", "1", repo_url, str(tmpdir)])
    except Exception as exc:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise RuntimeError(f"Failed to clone repo: {exc}")
    repo_name = repo_url.rstrip("/").split("/")[-1].replace(".git", "")
    return tmpdir, repo_name


def scan_source(source: str, include: Optional[Set[str]] = None, exclude: Optional[Set[str]] = None, max_size: int = 200_000) -> ScanResult:
    exclude = exclude or DEFAULT_EXCLUDE_PATTERNS

    if is_url(source):
        root, repo_name = _clone_repo(source)
        extension_config = _load_extension_config(root)
        include = include or _build_default_include_patterns(extension_config)
        exclude = set(exclude) | set(extension_config.get("exclude", set()))
        files = _scan_directory(root, include, exclude, max_size)
        return ScanResult(root=root, files=files, source=source, repo_name=repo_name)

    root = Path(source).expanduser().resolve()
    if not root.exists():
        raise FileNotFoundError(f"Source path not found: {root}")
    extension_config = _load_extension_config(root)
    include = include or _build_default_include_patterns(extension_config)
    exclude = set(exclude) | set(extension_config.get("exclude", set()))
    files = _scan_directory(root, include, exclude, max_size)
    return ScanResult(root=root, files=files, source=str(root), repo_name=root.name)
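
Finally, a hypothetical invocation of the scanner; the path is a placeholder. Note that _load_extension_config writes a default .ai-docs.yaml into the scanned root on first run, and passing a git URL instead of a local path triggers a shallow clone:

# Hypothetical sketch, not shipped with the package.
from ai_docs.scanner import scan_source

result = scan_source("./my-project", max_size=200_000)
print(result.repo_name, len(result.files))
for entry in result.files[:5]:
    print(entry["path"], entry["type"], entry["domains"])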