skill-seekers 2.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Unified Language Detection for Code Blocks
|
|
4
|
+
|
|
5
|
+
Provides confidence-based language detection for documentation scrapers.
|
|
6
|
+
Supports 20+ programming languages with weighted pattern matching.
|
|
7
|
+
|
|
8
|
+
Author: Skill Seekers Project
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)

# Swift patterns are kept in a separate module (fork-friendly architecture).
# Any failure to load them degrades to an empty table so that this module can
# still be imported; only the severity of the log record differs.
try:
    from skill_seekers.cli.swift_patterns import SWIFT_PATTERNS
except ImportError as exc:
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
    logger.warning(
        "Swift language detection patterns unavailable. Swift code detection will be disabled. Error: %s",
        exc,
    )
except Exception as exc:
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
    logger.error(
        "Failed to load Swift patterns due to unexpected error: %s. Swift detection disabled.", exc
    )

# Report the outcome of the load: usable table, table without the expected
# "swift" key, or nothing at all.
if SWIFT_PATTERNS and "swift" in SWIFT_PATTERNS:
    logger.info(
        "Swift patterns loaded successfully: %d patterns for language detection",
        len(SWIFT_PATTERNS["swift"]),
    )
elif SWIFT_PATTERNS:
    logger.error(
        "Swift patterns loaded but 'swift' key is missing. Swift detection is broken. Please file a bug report."
    )
else:
    logger.warning(
        "Swift pattern dictionary is empty. "
        "Swift detection is disabled. "
        "This may indicate swift_patterns.py has no patterns defined."
    )
|
|
46
|
+
|
|
47
|
+
# Comprehensive language patterns with weighted confidence scoring.
#
# Maps a language name to a list of (regex source, weight) pairs. For each
# language, LanguageDetector adds up the weights of every pattern that matches
# the code at least once, divides the total by 10 and caps the result at 1.0
# to obtain that language's confidence.
#
# Weight scale:
#   Weight 5: Unique identifiers (highly specific)
#   Weight 4: Strong indicators
#   Weight 3: Common patterns
#   Weight 2: Moderate indicators
#   Weight 1: Weak indicators
#
# NOTE: LanguageDetector compiles every pattern with
# re.IGNORECASE | re.MULTILINE, so letter case in the patterns below is not
# significant when matching, and "^" / "$" anchor at line boundaries.
# NOTE: Entry order is significant. When two languages reach the same score,
# the one listed earlier is reported, because the detector picks the winner
# with max(), which keeps the first maximum it encounters.

LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
    # ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) =====
    "csharp": [
        # Unity-specific patterns (weight 4-5, CRITICAL)
        (r"\busing\s+UnityEngine", 5),
        (r"\bMonoBehaviour\b", 5),
        (r"\bGameObject\b", 4),
        (r"\bTransform\b", 4),
        (r"\bVector[23]\b", 3),
        (r"\bQuaternion\b", 3),
        (r"\bvoid\s+Start\s*\(\)", 4),
        (r"\bvoid\s+Update\s*\(\)", 4),
        (r"\bvoid\s+Awake\s*\(\)", 4),
        (r"\bvoid\s+OnEnable\s*\(\)", 3),
        (r"\bvoid\s+OnDisable\s*\(\)", 3),
        (r"\bvoid\s+FixedUpdate\s*\(\)", 4),
        (r"\bvoid\s+LateUpdate\s*\(\)", 4),
        (r"\bvoid\s+OnCollisionEnter", 4),
        (r"\bvoid\s+OnTriggerEnter", 4),
        (r"\bIEnumerator\b", 4),
        (r"\bStartCoroutine\s*\(", 4),
        # The three "yield return" patterns overlap on purpose: a specific
        # form also matches the generic one, so their weights add up.
        (r"\byield\s+return\s+new\s+WaitForSeconds", 4),
        (r"\byield\s+return\s+null", 3),
        (r"\byield\s+return", 4),
        (r"\[SerializeField\]", 4),
        (r"\[RequireComponent", 4),
        (r"\[Header\(", 3),
        (r"\[Range\(", 3),
        (r"\bTime\.deltaTime\b", 4),
        (r"\bInput\.Get", 4),
        (r"\bRigidbody\b", 3),
        (r"\bCollider\b", 3),
        (r"\bRenderer\b", 3),
        (r"\bGetComponent<", 3),
        # Basic C# patterns (weight 2-4)
        (r"\bnamespace\s+\w+", 3),
        (r"\busing\s+System", 3),
        (r"\bConsole\.WriteLine", 4),  # C#-specific output
        (r"\bConsole\.Write", 3),
        (r"\bpublic\s+class\s+\w+", 4),  # Increased to match Java weight
        (r"\bprivate\s+class\s+\w+", 3),
        (r"\binternal\s+class\s+\w+", 4),  # C#-specific modifier
        (r"\bstring\s+\w+\s*[;=]", 2),  # C#-specific lowercase string
        (r"\bprivate\s+\w+\s+\w+\s*;", 2),  # Private fields (common in both C# and Java)
        (r"\{\s*get;\s*set;\s*\}", 3),  # Auto properties
        (r"\{\s*get;\s*private\s+set;\s*\}", 3),
        (r"\{\s*get\s*=>\s*", 2),  # Expression properties
        (r"\bpublic\s+static\s+void\s+", 2),
        # Modern C# patterns (weight 2)
        (r"\bfrom\s+\w+\s+in\s+", 2),  # LINQ
        (r"\.Where\s*\(", 2),
        (r"\.Select\s*\(", 2),
        (r"\basync\s+Task", 2),
        (r"\bawait\s+", 2),
        (r"\bvar\s+\w+\s*=", 1),
    ],
    # ===== PRIORITY 2: Frontend Languages =====
    "typescript": [
        # TypeScript-specific (weight 4-5)
        (r"\binterface\s+\w+\s*\{", 5),
        (r"\btype\s+\w+\s*=", 4),
        (r":\s*\w+\s*=", 3),  # Type annotation
        (r":\s*\w+\[\]", 3),  # Array type
        (r"<[\w,\s]+>", 2),  # Generic type
        (r"\bas\s+\w+", 2),  # Type assertion
        (r"\benum\s+\w+\s*\{", 4),
        (r"\bimplements\s+\w+", 3),
        (r"\bexport\s+interface", 4),
        (r"\bexport\s+type", 4),
        # Also has JS patterns (weight 1)
        (r"\bconst\s+\w+\s*=", 1),
        (r"\blet\s+\w+\s*=", 1),
        (r"=>", 1),
    ],
    "javascript": [
        (r"\bfunction\s+\w+\s*\(", 3),
        (r"\bconst\s+\w+\s*=", 2),
        (r"\blet\s+\w+\s*=", 2),
        (r"=>", 2),  # Arrow function
        (r"\bconsole\.log", 2),
        (r"\bvar\s+\w+\s*=", 1),
        (r"\.then\s*\(", 2),  # Promise
        (r"\.catch\s*\(", 2),  # Promise
        (r"\basync\s+function", 3),
        (r"\bawait\s+", 2),
        (r"require\s*\(", 2),  # CommonJS
        (r"\bexport\s+default", 2),  # ES6
        (r"\bexport\s+const", 2),
    ],
    "jsx": [
        # JSX patterns (weight 4-5)
        (r"<\w+\s+[^>]*>", 4),  # JSX tag with attributes
        (r"<\w+\s*/>", 4),  # Self-closing tag
        (r"className=", 3),  # React className
        (r"onClick=", 3),  # React event
        (r"\brender\s*\(\s*\)\s*\{", 4),  # React render
        (r"\buseState\s*\(", 4),  # React hook
        (r"\buseEffect\s*\(", 4),  # React hook
        (r"\buseRef\s*\(", 3),
        (r"\buseCallback\s*\(", 3),
        (r"\buseMemo\s*\(", 3),
        # Also has JS patterns
        (r"\bconst\s+\w+\s*=", 1),
        (r"=>", 1),
    ],
    "tsx": [
        # TSX = TypeScript + JSX (weight 5)
        (r"<\w+\s+[^>]*>", 3),  # JSX tag
        (r":\s*React\.\w+", 5),  # React types
        (r"interface\s+\w+Props", 5),  # Props interface
        (r"\bFunctionComponent<", 4),
        (r"\bReact\.FC<", 4),
        (r"\buseState<", 4),  # Typed hook
        (r"\buseRef<", 3),
        # Also has TS patterns
        (r"\binterface\s+\w+", 2),
        (r"\btype\s+\w+\s*=", 2),
    ],
    "vue": [
        # Vue SFC patterns (weight 4-5)
        (r"<template>", 5),
        (r"<script>", 3),
        (r"<style\s+scoped>", 4),
        (r"\bexport\s+default\s*\{", 3),
        (r"\bdata\s*\(\s*\)\s*\{", 4),  # Vue 2
        (r"\bcomputed\s*:", 3),
        (r"\bmethods\s*:", 3),
        (r"\bsetup\s*\(", 4),  # Vue 3 Composition
        (r"\bref\s*\(", 4),  # Vue 3
        (r"\breactive\s*\(", 4),  # Vue 3
        (r"v-bind:", 3),
        (r"v-for=", 3),
        (r"v-if=", 3),
        (r"v-model=", 3),
    ],
    # ===== PRIORITY 3: Backend Languages =====
    "java": [
        (r"\bpublic\s+class\s+\w+", 4),
        (r"\bprivate\s+\w+\s+\w+", 2),
        (r"\bSystem\.out\.println", 3),
        (r"\bpublic\s+static\s+void\s+main", 4),
        (r"\bpublic\s+\w+\s+\w+\s*\(", 2),
        (r"@Override", 3),
        (r"@Autowired", 3),  # Spring
        (r"@Service", 3),  # Spring
        (r"@RestController", 3),  # Spring
        (r"@GetMapping", 3),  # Spring
        (r"@PostMapping", 3),  # Spring
        (r"\bimport\s+java\.", 2),
        (r"\bextends\s+\w+", 2),
    ],
    "go": [
        (r"\bfunc\s+\w+\s*\(", 3),
        (r"\bpackage\s+\w+", 4),
        (r":=", 3),  # Short declaration
        (r"\bfmt\.Print", 2),
        (r"\bfunc\s+\(.*\)\s+\w+\s*\(", 4),  # Method
        (r"\bdefer\s+", 3),
        (r"\bgo\s+\w+\s*\(", 3),  # Goroutine
        (r"\bchan\s+", 3),  # Channel
        (r"\binterface\{\}", 2),  # Empty interface
        (r"\bfunc\s+main\s*\(\)", 4),
    ],
    "rust": [
        (r"\bfn\s+\w+\s*\(", 4),
        (r"\blet\s+mut\s+\w+", 3),
        (r"\bprintln!", 3),
        (r"\bimpl\s+\w+", 3),
        (r"\buse\s+\w+::", 3),
        (r"\bpub\s+fn\s+", 3),
        (r"\bmatch\s+\w+\s*\{", 3),
        (r"\bSome\(", 2),
        (r"\bNone\b", 2),
        (r"\bResult<", 3),
        (r"\bOption<", 3),
        (r"&str\b", 2),
        (r"\bfn\s+main\s*\(\)", 4),
    ],
    "php": [
        (r"<\?php", 5),
        (r"\$\w+\s*=", 2),
        (r"\bfunction\s+\w+\s*\(", 2),
        (r"\bpublic\s+function", 3),
        (r"\bprivate\s+function", 3),
        (r"\bclass\s+\w+", 3),
        (r"\bnamespace\s+\w+", 3),
        (r"\buse\s+\w+\\", 2),
        (r"->", 2),  # Object operator
        (r"::", 1),  # Static operator
    ],
    # ===== PRIORITY 4: System/Data Languages =====
    "python": [
        (r"\bdef\s+\w+\s*\(", 3),
        (r"\bimport\s+\w+", 2),
        (r"\bclass\s+\w+:", 3),
        (r"\bfrom\s+\w+\s+import", 2),
        (r":\s*$", 1),  # Lines ending with :
        (r"@\w+", 2),  # Decorator
        (r"\bself\.\w+", 2),
        (r"\b__init__\s*\(", 3),
        (r"\basync\s+def\s+", 3),
        (r"\bawait\s+", 2),
        (r"\bprint\s*\(", 1),
    ],
    "r": [
        (r"<-", 4),  # Assignment operator
        (r"\bfunction\s*\(", 2),
        (r"\blibrary\s*\(", 3),
        (r"\bggplot\s*\(", 4),  # ggplot2
        (r"\bdata\.frame\s*\(", 3),
        (r"\%>\%", 4),  # Pipe operator
        (r"\bsummary\s*\(", 2),
        (r"\bread\.csv\s*\(", 3),
    ],
    "julia": [
        (r"\bfunction\s+\w+\s*\(", 3),
        (r"\bend\b", 2),
        (r"\busing\s+\w+", 3),
        (r"::", 2),  # Type annotation
        (r"\bmodule\s+\w+", 3),
        (r"\babstract\s+type", 3),
        (r"\bstruct\s+\w+", 3),
    ],
    "sql": [
        (r"\bSELECT\s+", 4),
        (r"\bFROM\s+", 3),
        (r"\bWHERE\s+", 2),
        (r"\bINSERT\s+INTO", 4),
        (r"\bCREATE\s+TABLE", 4),
        (r"\bJOIN\s+", 3),
        (r"\bGROUP\s+BY", 3),
        (r"\bORDER\s+BY", 3),
        (r"\bUPDATE\s+", 3),
        (r"\bDELETE\s+FROM", 3),
    ],
    # ===== Additional Languages =====
    "cpp": [
        (r"#include\s*<", 4),
        (r"\bstd::", 3),
        (r"\bnamespace\s+\w+", 3),
        (r"\bcout\s*<<", 3),
        (r"\bvoid\s+\w+\s*\(", 2),
        (r"\bint\s+main\s*\(", 4),
        (r"->", 2),  # Pointer
    ],
    "c": [
        (r"#include\s*<", 4),
        (r"\bprintf\s*\(", 3),
        (r"\bint\s+main\s*\(", 4),
        (r"\bvoid\s+\w+\s*\(", 2),
        (r"\bstruct\s+\w+", 3),
    ],
    "gdscript": [
        (r"\bfunc\s+\w+\s*\(", 3),
        (r"\bvar\s+\w+\s*=", 3),
        (r"\bextends\s+\w+", 4),
        (r"\b_ready\s*\(", 4),
        (r"\b_process\s*\(", 4),
    ],
    # ===== Markup/Config Languages =====
    "html": [
        (r"<!DOCTYPE\s+html>", 5),
        (r"<html", 4),
        (r"<head>", 3),
        (r"<body>", 3),
        (r"<div", 2),
        (r"<span", 2),
        (r"<script", 2),
    ],
    "css": [
        (r"\{\s*[\w-]+\s*:", 3),
        (r"@media", 3),
        (r"\.[\w-]+\s*\{", 2),
        (r"#[\w-]+\s*\{", 2),
        (r"@import", 2),
    ],
    "json": [
        (r"^\s*\{", 3),
        (r"^\s*\[", 3),
        (r'"\w+"\s*:', 3),
        (r':\s*["\d\[\{]', 2),
    ],
    "yaml": [
        (r"^\w+:", 3),
        (r"^\s+-\s+\w+", 2),
        (r"---", 2),
        (r"^\s+\w+:", 2),
    ],
    "xml": [
        (r"<\?xml", 5),
        (r"<\w+\s+\w+=", 2),
        (r"<\w+>", 1),
        (r"</\w+>", 1),
    ],
    "markdown": [
        (r"^#+\s+", 3),
        (r"^\*\*\w+\*\*", 2),
        (r"^\s*[-*]\s+", 2),
        (r"\[.*\]\(.*\)", 2),
    ],
    "bash": [
        (r"#!/bin/bash", 5),
        (r"#!/bin/sh", 5),
        (r"\becho\s+", 2),
        (r"\$\{?\w+\}?", 2),
        (r"\bif\s+\[", 2),
        (r"\bfor\s+\w+\s+in", 2),
    ],
    # "shell" repeats the first four "bash" patterns; because "bash" is listed
    # first and has extra patterns, "shell" can tie with it but never outscore it.
    "shell": [
        (r"#!/bin/bash", 5),
        (r"#!/bin/sh", 5),
        (r"\becho\s+", 2),
        (r"\$\{?\w+\}?", 2),
    ],
    "powershell": [
        (r"\$\w+\s*=", 2),
        (r"Get-\w+", 3),
        (r"Set-\w+", 3),
        (r"\bWrite-Host\s+", 2),
    ],
}

# Merge Swift patterns (fork-friendly: patterns defined in swift_patterns.py).
# Keys that are new are appended after all entries above; a key that already
# exists in the table is replaced by the Swift module's value.
LANGUAGE_PATTERNS.update(SWIFT_PATTERNS)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
# Known language list for CSS class detection
|
|
381
|
+
KNOWN_LANGUAGES = [
|
|
382
|
+
"javascript",
|
|
383
|
+
"java",
|
|
384
|
+
"xml",
|
|
385
|
+
"html",
|
|
386
|
+
"python",
|
|
387
|
+
"bash",
|
|
388
|
+
"cpp",
|
|
389
|
+
"typescript",
|
|
390
|
+
"go",
|
|
391
|
+
"rust",
|
|
392
|
+
"php",
|
|
393
|
+
"ruby",
|
|
394
|
+
"swift",
|
|
395
|
+
"kotlin",
|
|
396
|
+
"csharp",
|
|
397
|
+
"c",
|
|
398
|
+
"sql",
|
|
399
|
+
"yaml",
|
|
400
|
+
"json",
|
|
401
|
+
"markdown",
|
|
402
|
+
"css",
|
|
403
|
+
"scss",
|
|
404
|
+
"sass",
|
|
405
|
+
"jsx",
|
|
406
|
+
"tsx",
|
|
407
|
+
"vue",
|
|
408
|
+
"shell",
|
|
409
|
+
"powershell",
|
|
410
|
+
"r",
|
|
411
|
+
"scala",
|
|
412
|
+
"dart",
|
|
413
|
+
"perl",
|
|
414
|
+
"lua",
|
|
415
|
+
"elixir",
|
|
416
|
+
"julia",
|
|
417
|
+
"gdscript",
|
|
418
|
+
]
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
class LanguageDetector:
|
|
422
|
+
"""
|
|
423
|
+
Unified confidence-based language detection for code blocks.
|
|
424
|
+
|
|
425
|
+
Supports 20+ programming languages with weighted pattern matching.
|
|
426
|
+
Uses two-stage detection:
|
|
427
|
+
1. CSS class extraction (high confidence = 1.0)
|
|
428
|
+
2. Pattern-based heuristics with confidence scoring (0.0-1.0)
|
|
429
|
+
|
|
430
|
+
Example:
|
|
431
|
+
detector = LanguageDetector(min_confidence=0.3)
|
|
432
|
+
lang, confidence = detector.detect_from_html(elem, code)
|
|
433
|
+
|
|
434
|
+
if confidence >= 0.7:
|
|
435
|
+
print(f"High confidence: {lang}")
|
|
436
|
+
elif confidence >= 0.5:
|
|
437
|
+
print(f"Medium confidence: {lang}")
|
|
438
|
+
else:
|
|
439
|
+
print(f"Low confidence: {lang}")
|
|
440
|
+
"""
|
|
441
|
+
|
|
442
|
+
def __init__(self, min_confidence: float = 0.15):
|
|
443
|
+
"""
|
|
444
|
+
Initialize language detector.
|
|
445
|
+
|
|
446
|
+
Args:
|
|
447
|
+
min_confidence: Minimum confidence threshold (0-1)
|
|
448
|
+
0.3 = low, 0.5 = medium, 0.7 = high
|
|
449
|
+
"""
|
|
450
|
+
self.min_confidence = min_confidence
|
|
451
|
+
self._pattern_cache: dict[str, list[tuple[re.Pattern, int]]] = {}
|
|
452
|
+
self._compile_patterns()
|
|
453
|
+
|
|
454
|
+
def _compile_patterns(self) -> None:
|
|
455
|
+
"""Compile regex patterns and cache them for performance"""
|
|
456
|
+
for lang, patterns in LANGUAGE_PATTERNS.items():
|
|
457
|
+
compiled_patterns = []
|
|
458
|
+
for i, (pattern, weight) in enumerate(patterns):
|
|
459
|
+
try:
|
|
460
|
+
compiled = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
|
|
461
|
+
compiled_patterns.append((compiled, weight))
|
|
462
|
+
except re.error as e:
|
|
463
|
+
logger.error(
|
|
464
|
+
"Invalid regex pattern for language '%s' at index %d: '%s'. Error: %s. Pattern skipped.",
|
|
465
|
+
lang,
|
|
466
|
+
i,
|
|
467
|
+
pattern[:50],
|
|
468
|
+
e,
|
|
469
|
+
)
|
|
470
|
+
except TypeError:
|
|
471
|
+
logger.error(
|
|
472
|
+
"Pattern for language '%s' at index %d is not a string: %s. Pattern skipped.",
|
|
473
|
+
lang,
|
|
474
|
+
i,
|
|
475
|
+
type(pattern).__name__,
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
if compiled_patterns:
|
|
479
|
+
self._pattern_cache[lang] = compiled_patterns
|
|
480
|
+
else:
|
|
481
|
+
logger.warning(
|
|
482
|
+
"No valid patterns compiled for language '%s'. Detection for this language is disabled.",
|
|
483
|
+
lang,
|
|
484
|
+
)
|
|
485
|
+
|
|
486
|
+
def detect_from_html(self, elem, code: str) -> tuple[str, float]:
|
|
487
|
+
"""
|
|
488
|
+
Detect language from HTML element with CSS classes + code content.
|
|
489
|
+
|
|
490
|
+
Args:
|
|
491
|
+
elem: BeautifulSoup element with 'class' attribute
|
|
492
|
+
code: Code content string
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
Tuple of (language, confidence) where confidence is 0.0-1.0
|
|
496
|
+
"""
|
|
497
|
+
# Tier 1: CSS classes (confidence 1.0)
|
|
498
|
+
if elem:
|
|
499
|
+
css_lang = self.extract_language_from_classes(elem.get("class", []))
|
|
500
|
+
if css_lang:
|
|
501
|
+
return css_lang, 1.0
|
|
502
|
+
|
|
503
|
+
# Check parent pre element
|
|
504
|
+
parent = elem.parent
|
|
505
|
+
if parent and parent.name == "pre":
|
|
506
|
+
css_lang = self.extract_language_from_classes(parent.get("class", []))
|
|
507
|
+
if css_lang:
|
|
508
|
+
return css_lang, 1.0
|
|
509
|
+
|
|
510
|
+
# Tier 2: Pattern matching
|
|
511
|
+
return self.detect_from_code(code)
|
|
512
|
+
|
|
513
|
+
def detect_from_code(self, code: str) -> tuple[str, float]:
|
|
514
|
+
"""
|
|
515
|
+
Detect language from code content only (for PDFs, GitHub files).
|
|
516
|
+
|
|
517
|
+
Args:
|
|
518
|
+
code: Code content string
|
|
519
|
+
|
|
520
|
+
Returns:
|
|
521
|
+
Tuple of (language, confidence) where confidence is 0.0-1.0
|
|
522
|
+
"""
|
|
523
|
+
# Edge case: code too short
|
|
524
|
+
if len(code.strip()) < 10:
|
|
525
|
+
return "unknown", 0.0
|
|
526
|
+
|
|
527
|
+
# Calculate confidence scores for all languages
|
|
528
|
+
scores = self._calculate_confidence(code)
|
|
529
|
+
|
|
530
|
+
if not scores:
|
|
531
|
+
return "unknown", 0.0
|
|
532
|
+
|
|
533
|
+
# Get language with highest score
|
|
534
|
+
best_lang = max(scores.items(), key=lambda x: x[1])
|
|
535
|
+
lang, confidence = best_lang
|
|
536
|
+
|
|
537
|
+
# Apply minimum confidence threshold
|
|
538
|
+
if confidence < self.min_confidence:
|
|
539
|
+
return "unknown", 0.0
|
|
540
|
+
|
|
541
|
+
return lang, confidence
|
|
542
|
+
|
|
543
|
+
def extract_language_from_classes(self, classes: list[str]) -> str | None:
|
|
544
|
+
"""
|
|
545
|
+
Extract language from CSS class list.
|
|
546
|
+
|
|
547
|
+
Supports patterns:
|
|
548
|
+
- language-* (e.g., language-python)
|
|
549
|
+
- lang-* (e.g., lang-javascript)
|
|
550
|
+
- brush: * (e.g., brush: java)
|
|
551
|
+
- Bare names (e.g., python, java)
|
|
552
|
+
|
|
553
|
+
Args:
|
|
554
|
+
classes: List of CSS class names
|
|
555
|
+
|
|
556
|
+
Returns:
|
|
557
|
+
Language string or None if not found
|
|
558
|
+
"""
|
|
559
|
+
if not classes:
|
|
560
|
+
return None
|
|
561
|
+
|
|
562
|
+
for cls in classes:
|
|
563
|
+
# Handle brush: pattern
|
|
564
|
+
if "brush:" in cls:
|
|
565
|
+
parts = cls.split("brush:")
|
|
566
|
+
if len(parts) > 1:
|
|
567
|
+
lang = parts[1].strip().lower()
|
|
568
|
+
if lang in KNOWN_LANGUAGES:
|
|
569
|
+
return lang
|
|
570
|
+
|
|
571
|
+
# Handle language- prefix
|
|
572
|
+
if cls.startswith("language-"):
|
|
573
|
+
lang = cls[9:].lower()
|
|
574
|
+
if lang in KNOWN_LANGUAGES:
|
|
575
|
+
return lang
|
|
576
|
+
|
|
577
|
+
# Handle lang- prefix
|
|
578
|
+
if cls.startswith("lang-"):
|
|
579
|
+
lang = cls[5:].lower()
|
|
580
|
+
if lang in KNOWN_LANGUAGES:
|
|
581
|
+
return lang
|
|
582
|
+
|
|
583
|
+
# Handle bare class name
|
|
584
|
+
if cls.lower() in KNOWN_LANGUAGES:
|
|
585
|
+
return cls.lower()
|
|
586
|
+
|
|
587
|
+
return None
|
|
588
|
+
|
|
589
|
+
def _calculate_confidence(self, code: str) -> dict[str, float]:
|
|
590
|
+
"""
|
|
591
|
+
Calculate weighted confidence scores for all languages.
|
|
592
|
+
|
|
593
|
+
Args:
|
|
594
|
+
code: Code content string
|
|
595
|
+
|
|
596
|
+
Returns:
|
|
597
|
+
Dictionary mapping language names to confidence scores (0.0-1.0)
|
|
598
|
+
"""
|
|
599
|
+
scores: dict[str, float] = {}
|
|
600
|
+
|
|
601
|
+
for lang, compiled_patterns in self._pattern_cache.items():
|
|
602
|
+
total_score = 0
|
|
603
|
+
|
|
604
|
+
for pattern, weight in compiled_patterns:
|
|
605
|
+
if pattern.search(code):
|
|
606
|
+
total_score += weight
|
|
607
|
+
|
|
608
|
+
if total_score > 0:
|
|
609
|
+
# Normalize score to 0-1 range
|
|
610
|
+
# Score of 10+ = 1.0 confidence
|
|
611
|
+
confidence = min(total_score / 10.0, 1.0)
|
|
612
|
+
scores[lang] = confidence
|
|
613
|
+
|
|
614
|
+
return scores
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# ABOUTME: Detects and validates llms.txt file availability at documentation URLs
|
|
2
|
+
# ABOUTME: Supports llms-full.txt, llms.txt, and llms-small.txt variants
|
|
3
|
+
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LlmsTxtDetector:
|
|
10
|
+
"""Detect llms.txt files at documentation URLs"""
|
|
11
|
+
|
|
12
|
+
VARIANTS = [("llms-full.txt", "full"), ("llms.txt", "standard"), ("llms-small.txt", "small")]
|
|
13
|
+
|
|
14
|
+
def __init__(self, base_url: str):
|
|
15
|
+
self.base_url = base_url.rstrip("/")
|
|
16
|
+
|
|
17
|
+
def detect(self) -> dict[str, str] | None:
|
|
18
|
+
"""
|
|
19
|
+
Detect available llms.txt variant.
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
Dict with 'url' and 'variant' keys, or None if not found
|
|
23
|
+
"""
|
|
24
|
+
parsed = urlparse(self.base_url)
|
|
25
|
+
root_url = f"{parsed.scheme}://{parsed.netloc}"
|
|
26
|
+
|
|
27
|
+
for filename, variant in self.VARIANTS:
|
|
28
|
+
url = f"{root_url}/{filename}"
|
|
29
|
+
|
|
30
|
+
if self._check_url_exists(url):
|
|
31
|
+
return {"url": url, "variant": variant}
|
|
32
|
+
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
def detect_all(self) -> list[dict[str, str]]:
|
|
36
|
+
"""
|
|
37
|
+
Detect all available llms.txt variants.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
List of dicts with 'url' and 'variant' keys for each found variant
|
|
41
|
+
"""
|
|
42
|
+
found_variants = []
|
|
43
|
+
|
|
44
|
+
for filename, variant in self.VARIANTS:
|
|
45
|
+
parsed = urlparse(self.base_url)
|
|
46
|
+
root_url = f"{parsed.scheme}://{parsed.netloc}"
|
|
47
|
+
url = f"{root_url}/{filename}"
|
|
48
|
+
|
|
49
|
+
if self._check_url_exists(url):
|
|
50
|
+
found_variants.append({"url": url, "variant": variant})
|
|
51
|
+
|
|
52
|
+
return found_variants
|
|
53
|
+
|
|
54
|
+
def _check_url_exists(self, url: str) -> bool:
|
|
55
|
+
"""Check if URL returns 200 status"""
|
|
56
|
+
try:
|
|
57
|
+
response = requests.head(url, timeout=5, allow_redirects=True)
|
|
58
|
+
return response.status_code == 200
|
|
59
|
+
except requests.RequestException:
|
|
60
|
+
return False
|