skill-seekers 2.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,614 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unified Language Detection for Code Blocks
4
+
5
+ Provides confidence-based language detection for documentation scrapers.
6
+ Supports 20+ programming languages with weighted pattern matching.
7
+
8
+ Author: Skill Seekers Project
9
+ """
10
+
11
+ import logging
12
+ import re
13
+
14
logger = logging.getLogger(__name__)

# Swift patterns are kept in their own module (fork-friendly architecture).
# Any failure to import them downgrades to "no Swift detection" rather than
# breaking the import of this module.
try:
    from skill_seekers.cli.swift_patterns import SWIFT_PATTERNS
except ImportError as import_err:
    # The module (or the name inside it) is not available.
    logger.warning(
        "Swift language detection patterns unavailable. Swift code detection will be disabled. Error: %s",
        import_err,
    )
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
except Exception as unexpected_err:
    # The module exists but raised something else while being imported.
    logger.error(
        "Failed to load Swift patterns due to unexpected error: %s. Swift detection disabled.",
        unexpected_err,
    )
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}

# Report the outcome of the load: usable table, table without a "swift" key,
# or nothing at all.
if "swift" in SWIFT_PATTERNS:
    logger.info(
        "Swift patterns loaded successfully: %d patterns for language detection",
        len(SWIFT_PATTERNS.get("swift", [])),
    )
elif SWIFT_PATTERNS:
    logger.error(
        "Swift patterns loaded but 'swift' key is missing. Swift detection is broken. Please file a bug report."
    )
else:
    logger.warning(
        "Swift pattern dictionary is empty. Swift detection is disabled. "
        "This may indicate swift_patterns.py has no patterns defined."
    )
46
+
47
# Comprehensive language patterns with weighted confidence scoring
# Weight 5: Unique identifiers (highly specific)
# Weight 4: Strong indicators
# Weight 3: Common patterns
# Weight 2: Moderate indicators
# Weight 1: Weak indicators
#
# How this table is consumed (see LanguageDetector in this module):
#   * Each entry is (regex source, weight). A pattern contributes its weight
#     at most once per code block, no matter how often it matches.
#   * Patterns are compiled with re.IGNORECASE | re.MULTILINE, so upper-case
#     literals such as SELECT also match lower-case text, and ^ / $ anchor at
#     every line of the block.
#   * Key order matters: when two languages tie, the one listed first wins.

LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
    # ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) =====
    "csharp": [
        # Unity-specific patterns (weight 4-5, CRITICAL)
        (r"\busing\s+UnityEngine", 5),
        (r"\bMonoBehaviour\b", 5),
        (r"\bGameObject\b", 4),
        (r"\bTransform\b", 4),
        (r"\bVector[23]\b", 3),
        (r"\bQuaternion\b", 3),
        (r"\bvoid\s+Start\s*\(\)", 4),
        (r"\bvoid\s+Update\s*\(\)", 4),
        (r"\bvoid\s+Awake\s*\(\)", 4),
        (r"\bvoid\s+OnEnable\s*\(\)", 3),
        (r"\bvoid\s+OnDisable\s*\(\)", 3),
        (r"\bvoid\s+FixedUpdate\s*\(\)", 4),
        (r"\bvoid\s+LateUpdate\s*\(\)", 4),
        (r"\bvoid\s+OnCollisionEnter", 4),
        (r"\bvoid\s+OnTriggerEnter", 4),
        (r"\bIEnumerator\b", 4),
        (r"\bStartCoroutine\s*\(", 4),
        (r"\byield\s+return\s+new\s+WaitForSeconds", 4),
        (r"\byield\s+return\s+null", 3),
        (r"\byield\s+return", 4),
        (r"\[SerializeField\]", 4),
        (r"\[RequireComponent", 4),
        (r"\[Header\(", 3),
        (r"\[Range\(", 3),
        (r"\bTime\.deltaTime\b", 4),
        (r"\bInput\.Get", 4),
        (r"\bRigidbody\b", 3),
        (r"\bCollider\b", 3),
        (r"\bRenderer\b", 3),
        (r"\bGetComponent<", 3),
        # Basic C# patterns (weight 2-4)
        (r"\bnamespace\s+\w+", 3),
        (r"\busing\s+System", 3),
        (r"\bConsole\.WriteLine", 4),  # C#-specific output
        (r"\bConsole\.Write", 3),
        (r"\bpublic\s+class\s+\w+", 4),  # Increased to match Java weight
        (r"\bprivate\s+class\s+\w+", 3),
        (r"\binternal\s+class\s+\w+", 4),  # C#-specific modifier
        # NOTE(review): intended for C#'s lowercase `string`, but the detector
        # compiles with re.IGNORECASE, so `String x =` matches as well.
        (r"\bstring\s+\w+\s*[;=]", 2),  # C#-specific lowercase string
        (r"\bprivate\s+\w+\s+\w+\s*;", 2),  # Private fields (common in both C# and Java)
        (r"\{\s*get;\s*set;\s*\}", 3),  # Auto properties
        (r"\{\s*get;\s*private\s+set;\s*\}", 3),
        (r"\{\s*get\s*=>\s*", 2),  # Expression properties
        (r"\bpublic\s+static\s+void\s+", 2),
        # Modern C# patterns (weight 2)
        (r"\bfrom\s+\w+\s+in\s+", 2),  # LINQ
        (r"\.Where\s*\(", 2),
        (r"\.Select\s*\(", 2),
        (r"\basync\s+Task", 2),
        (r"\bawait\s+", 2),
        (r"\bvar\s+\w+\s*=", 1),
    ],
    # ===== PRIORITY 2: Frontend Languages =====
    "typescript": [
        # TypeScript-specific (weight 4-5)
        (r"\binterface\s+\w+\s*\{", 5),
        (r"\btype\s+\w+\s*=", 4),
        (r":\s*\w+\s*=", 3),  # Type annotation
        (r":\s*\w+\[\]", 3),  # Array type
        (r"<[\w,\s]+>", 2),  # Generic type
        (r"\bas\s+\w+", 2),  # Type assertion
        (r"\benum\s+\w+\s*\{", 4),
        (r"\bimplements\s+\w+", 3),
        (r"\bexport\s+interface", 4),
        (r"\bexport\s+type", 4),
        # Also has JS patterns (weight 1)
        (r"\bconst\s+\w+\s*=", 1),
        (r"\blet\s+\w+\s*=", 1),
        (r"=>", 1),
    ],
    "javascript": [
        (r"\bfunction\s+\w+\s*\(", 3),
        (r"\bconst\s+\w+\s*=", 2),
        (r"\blet\s+\w+\s*=", 2),
        (r"=>", 2),  # Arrow function
        (r"\bconsole\.log", 2),
        (r"\bvar\s+\w+\s*=", 1),
        (r"\.then\s*\(", 2),  # Promise
        (r"\.catch\s*\(", 2),  # Promise
        (r"\basync\s+function", 3),
        (r"\bawait\s+", 2),
        (r"require\s*\(", 2),  # CommonJS
        (r"\bexport\s+default", 2),  # ES6
        (r"\bexport\s+const", 2),
    ],
    "jsx": [
        # JSX patterns (weight 4-5)
        (r"<\w+\s+[^>]*>", 4),  # JSX tag with attributes
        (r"<\w+\s*/>", 4),  # Self-closing tag
        (r"className=", 3),  # React className
        (r"onClick=", 3),  # React event
        (r"\brender\s*\(\s*\)\s*\{", 4),  # React render
        (r"\buseState\s*\(", 4),  # React hook
        (r"\buseEffect\s*\(", 4),  # React hook
        (r"\buseRef\s*\(", 3),
        (r"\buseCallback\s*\(", 3),
        (r"\buseMemo\s*\(", 3),
        # Also has JS patterns
        (r"\bconst\s+\w+\s*=", 1),
        (r"=>", 1),
    ],
    "tsx": [
        # TSX = TypeScript + JSX (weight 5)
        (r"<\w+\s+[^>]*>", 3),  # JSX tag
        (r":\s*React\.\w+", 5),  # React types
        (r"interface\s+\w+Props", 5),  # Props interface
        (r"\bFunctionComponent<", 4),
        (r"\bReact\.FC<", 4),
        (r"\buseState<", 4),  # Typed hook
        (r"\buseRef<", 3),
        # Also has TS patterns
        (r"\binterface\s+\w+", 2),
        (r"\btype\s+\w+\s*=", 2),
    ],
    "vue": [
        # Vue SFC patterns (weight 4-5)
        (r"<template>", 5),
        (r"<script>", 3),
        (r"<style\s+scoped>", 4),
        (r"\bexport\s+default\s*\{", 3),
        (r"\bdata\s*\(\s*\)\s*\{", 4),  # Vue 2
        (r"\bcomputed\s*:", 3),
        (r"\bmethods\s*:", 3),
        (r"\bsetup\s*\(", 4),  # Vue 3 Composition
        (r"\bref\s*\(", 4),  # Vue 3
        (r"\breactive\s*\(", 4),  # Vue 3
        (r"v-bind:", 3),
        (r"v-for=", 3),
        (r"v-if=", 3),
        (r"v-model=", 3),
    ],
    # ===== PRIORITY 3: Backend Languages =====
    "java": [
        (r"\bpublic\s+class\s+\w+", 4),
        (r"\bprivate\s+\w+\s+\w+", 2),
        (r"\bSystem\.out\.println", 3),
        (r"\bpublic\s+static\s+void\s+main", 4),
        (r"\bpublic\s+\w+\s+\w+\s*\(", 2),
        (r"@Override", 3),
        (r"@Autowired", 3),  # Spring
        (r"@Service", 3),  # Spring
        (r"@RestController", 3),  # Spring
        (r"@GetMapping", 3),  # Spring
        (r"@PostMapping", 3),  # Spring
        (r"\bimport\s+java\.", 2),
        (r"\bextends\s+\w+", 2),
    ],
    "go": [
        (r"\bfunc\s+\w+\s*\(", 3),
        (r"\bpackage\s+\w+", 4),
        (r":=", 3),  # Short declaration
        (r"\bfmt\.Print", 2),
        (r"\bfunc\s+\(.*\)\s+\w+\s*\(", 4),  # Method
        (r"\bdefer\s+", 3),
        (r"\bgo\s+\w+\s*\(", 3),  # Goroutine
        (r"\bchan\s+", 3),  # Channel
        (r"\binterface\{\}", 2),  # Empty interface
        (r"\bfunc\s+main\s*\(\)", 4),
    ],
    "rust": [
        (r"\bfn\s+\w+\s*\(", 4),
        (r"\blet\s+mut\s+\w+", 3),
        (r"\bprintln!", 3),
        (r"\bimpl\s+\w+", 3),
        (r"\buse\s+\w+::", 3),
        (r"\bpub\s+fn\s+", 3),
        (r"\bmatch\s+\w+\s*\{", 3),
        (r"\bSome\(", 2),
        (r"\bNone\b", 2),
        (r"\bResult<", 3),
        (r"\bOption<", 3),
        (r"&str\b", 2),
        (r"\bfn\s+main\s*\(\)", 4),
    ],
    "php": [
        (r"<\?php", 5),
        (r"\$\w+\s*=", 2),
        (r"\bfunction\s+\w+\s*\(", 2),
        (r"\bpublic\s+function", 3),
        (r"\bprivate\s+function", 3),
        (r"\bclass\s+\w+", 3),
        (r"\bnamespace\s+\w+", 3),
        (r"\buse\s+\w+\\", 2),
        (r"->", 2),  # Object operator
        (r"::", 1),  # Static operator
    ],
    # ===== PRIORITY 4: System/Data Languages =====
    "python": [
        (r"\bdef\s+\w+\s*\(", 3),
        (r"\bimport\s+\w+", 2),
        (r"\bclass\s+\w+:", 3),
        (r"\bfrom\s+\w+\s+import", 2),
        (r":\s*$", 1),  # Lines ending with :
        (r"@\w+", 2),  # Decorator
        (r"\bself\.\w+", 2),
        (r"\b__init__\s*\(", 3),
        (r"\basync\s+def\s+", 3),
        (r"\bawait\s+", 2),
        (r"\bprint\s*\(", 1),
    ],
    "r": [
        (r"<-", 4),  # Assignment operator
        (r"\bfunction\s*\(", 2),
        (r"\blibrary\s*\(", 3),
        (r"\bggplot\s*\(", 4),  # ggplot2
        (r"\bdata\.frame\s*\(", 3),
        (r"\%>\%", 4),  # Pipe operator
        (r"\bsummary\s*\(", 2),
        (r"\bread\.csv\s*\(", 3),
    ],
    "julia": [
        (r"\bfunction\s+\w+\s*\(", 3),
        (r"\bend\b", 2),
        (r"\busing\s+\w+", 3),
        (r"::", 2),  # Type annotation
        (r"\bmodule\s+\w+", 3),
        (r"\babstract\s+type", 3),
        (r"\bstruct\s+\w+", 3),
    ],
    "sql": [
        (r"\bSELECT\s+", 4),
        (r"\bFROM\s+", 3),
        (r"\bWHERE\s+", 2),
        (r"\bINSERT\s+INTO", 4),
        (r"\bCREATE\s+TABLE", 4),
        (r"\bJOIN\s+", 3),
        (r"\bGROUP\s+BY", 3),
        (r"\bORDER\s+BY", 3),
        (r"\bUPDATE\s+", 3),
        (r"\bDELETE\s+FROM", 3),
    ],
    # ===== Additional Languages =====
    "cpp": [
        (r"#include\s*<", 4),
        (r"\bstd::", 3),
        (r"\bnamespace\s+\w+", 3),
        (r"\bcout\s*<<", 3),
        (r"\bvoid\s+\w+\s*\(", 2),
        (r"\bint\s+main\s*\(", 4),
        (r"->", 2),  # Pointer
    ],
    "c": [
        (r"#include\s*<", 4),
        (r"\bprintf\s*\(", 3),
        (r"\bint\s+main\s*\(", 4),
        (r"\bvoid\s+\w+\s*\(", 2),
        (r"\bstruct\s+\w+", 3),
    ],
    "gdscript": [
        (r"\bfunc\s+\w+\s*\(", 3),
        (r"\bvar\s+\w+\s*=", 3),
        (r"\bextends\s+\w+", 4),
        (r"\b_ready\s*\(", 4),
        (r"\b_process\s*\(", 4),
    ],
    # ===== Markup/Config Languages =====
    "html": [
        (r"<!DOCTYPE\s+html>", 5),
        (r"<html", 4),
        (r"<head>", 3),
        (r"<body>", 3),
        (r"<div", 2),
        (r"<span", 2),
        (r"<script", 2),
    ],
    "css": [
        (r"\{\s*[\w-]+\s*:", 3),
        (r"@media", 3),
        (r"\.[\w-]+\s*\{", 2),
        (r"#[\w-]+\s*\{", 2),
        (r"@import", 2),
    ],
    "json": [
        # ^ matches at the start of any line because of re.MULTILINE.
        (r"^\s*\{", 3),
        (r"^\s*\[", 3),
        (r'"\w+"\s*:', 3),
        (r':\s*["\d\[\{]', 2),
    ],
    "yaml": [
        (r"^\w+:", 3),
        (r"^\s+-\s+\w+", 2),
        (r"---", 2),
        (r"^\s+\w+:", 2),
    ],
    "xml": [
        (r"<\?xml", 5),
        (r"<\w+\s+\w+=", 2),
        (r"<\w+>", 1),
        (r"</\w+>", 1),
    ],
    "markdown": [
        (r"^#+\s+", 3),
        (r"^\*\*\w+\*\*", 2),
        (r"^\s*[-*]\s+", 2),
        (r"\[.*\]\(.*\)", 2),
    ],
    # "bash" and "shell" share their first four patterns; "bash" adds two more.
    "bash": [
        (r"#!/bin/bash", 5),
        (r"#!/bin/sh", 5),
        (r"\becho\s+", 2),
        (r"\$\{?\w+\}?", 2),
        (r"\bif\s+\[", 2),
        (r"\bfor\s+\w+\s+in", 2),
    ],
    "shell": [
        (r"#!/bin/bash", 5),
        (r"#!/bin/sh", 5),
        (r"\becho\s+", 2),
        (r"\$\{?\w+\}?", 2),
    ],
    "powershell": [
        (r"\$\w+\s*=", 2),
        (r"Get-\w+", 3),
        (r"Set-\w+", 3),
        (r"\bWrite-Host\s+", 2),
    ],
}

# Merge Swift patterns (fork-friendly: patterns defined in swift_patterns.py)
# dict.update() replaces any key already present above and appends new keys
# at the end of the iteration order.
LANGUAGE_PATTERNS.update(SWIFT_PATTERNS)
378
+
379
+
380
# Known language list for CSS class detection
# LanguageDetector.extract_language_from_classes() accepts a class-derived
# name only if it appears here (compared in lower case). The list is wider
# than LANGUAGE_PATTERNS above: names such as "ruby" or "kotlin" can be
# recognised from a CSS class even though no regex patterns are defined for
# them in this module.
KNOWN_LANGUAGES = [
    "javascript",
    "java",
    "xml",
    "html",
    "python",
    "bash",
    "cpp",
    "typescript",
    "go",
    "rust",
    "php",
    "ruby",
    "swift",
    "kotlin",
    "csharp",
    "c",
    "sql",
    "yaml",
    "json",
    "markdown",
    "css",
    "scss",
    "sass",
    "jsx",
    "tsx",
    "vue",
    "shell",
    "powershell",
    "r",
    "scala",
    "dart",
    "perl",
    "lua",
    "elixir",
    "julia",
    "gdscript",
]
419
+
420
+
421
+ class LanguageDetector:
422
+ """
423
+ Unified confidence-based language detection for code blocks.
424
+
425
+ Supports 20+ programming languages with weighted pattern matching.
426
+ Uses two-stage detection:
427
+ 1. CSS class extraction (high confidence = 1.0)
428
+ 2. Pattern-based heuristics with confidence scoring (0.0-1.0)
429
+
430
+ Example:
431
+ detector = LanguageDetector(min_confidence=0.3)
432
+ lang, confidence = detector.detect_from_html(elem, code)
433
+
434
+ if confidence >= 0.7:
435
+ print(f"High confidence: {lang}")
436
+ elif confidence >= 0.5:
437
+ print(f"Medium confidence: {lang}")
438
+ else:
439
+ print(f"Low confidence: {lang}")
440
+ """
441
+
442
+ def __init__(self, min_confidence: float = 0.15):
443
+ """
444
+ Initialize language detector.
445
+
446
+ Args:
447
+ min_confidence: Minimum confidence threshold (0-1)
448
+ 0.3 = low, 0.5 = medium, 0.7 = high
449
+ """
450
+ self.min_confidence = min_confidence
451
+ self._pattern_cache: dict[str, list[tuple[re.Pattern, int]]] = {}
452
+ self._compile_patterns()
453
+
454
+ def _compile_patterns(self) -> None:
455
+ """Compile regex patterns and cache them for performance"""
456
+ for lang, patterns in LANGUAGE_PATTERNS.items():
457
+ compiled_patterns = []
458
+ for i, (pattern, weight) in enumerate(patterns):
459
+ try:
460
+ compiled = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
461
+ compiled_patterns.append((compiled, weight))
462
+ except re.error as e:
463
+ logger.error(
464
+ "Invalid regex pattern for language '%s' at index %d: '%s'. Error: %s. Pattern skipped.",
465
+ lang,
466
+ i,
467
+ pattern[:50],
468
+ e,
469
+ )
470
+ except TypeError:
471
+ logger.error(
472
+ "Pattern for language '%s' at index %d is not a string: %s. Pattern skipped.",
473
+ lang,
474
+ i,
475
+ type(pattern).__name__,
476
+ )
477
+
478
+ if compiled_patterns:
479
+ self._pattern_cache[lang] = compiled_patterns
480
+ else:
481
+ logger.warning(
482
+ "No valid patterns compiled for language '%s'. Detection for this language is disabled.",
483
+ lang,
484
+ )
485
+
486
+ def detect_from_html(self, elem, code: str) -> tuple[str, float]:
487
+ """
488
+ Detect language from HTML element with CSS classes + code content.
489
+
490
+ Args:
491
+ elem: BeautifulSoup element with 'class' attribute
492
+ code: Code content string
493
+
494
+ Returns:
495
+ Tuple of (language, confidence) where confidence is 0.0-1.0
496
+ """
497
+ # Tier 1: CSS classes (confidence 1.0)
498
+ if elem:
499
+ css_lang = self.extract_language_from_classes(elem.get("class", []))
500
+ if css_lang:
501
+ return css_lang, 1.0
502
+
503
+ # Check parent pre element
504
+ parent = elem.parent
505
+ if parent and parent.name == "pre":
506
+ css_lang = self.extract_language_from_classes(parent.get("class", []))
507
+ if css_lang:
508
+ return css_lang, 1.0
509
+
510
+ # Tier 2: Pattern matching
511
+ return self.detect_from_code(code)
512
+
513
+ def detect_from_code(self, code: str) -> tuple[str, float]:
514
+ """
515
+ Detect language from code content only (for PDFs, GitHub files).
516
+
517
+ Args:
518
+ code: Code content string
519
+
520
+ Returns:
521
+ Tuple of (language, confidence) where confidence is 0.0-1.0
522
+ """
523
+ # Edge case: code too short
524
+ if len(code.strip()) < 10:
525
+ return "unknown", 0.0
526
+
527
+ # Calculate confidence scores for all languages
528
+ scores = self._calculate_confidence(code)
529
+
530
+ if not scores:
531
+ return "unknown", 0.0
532
+
533
+ # Get language with highest score
534
+ best_lang = max(scores.items(), key=lambda x: x[1])
535
+ lang, confidence = best_lang
536
+
537
+ # Apply minimum confidence threshold
538
+ if confidence < self.min_confidence:
539
+ return "unknown", 0.0
540
+
541
+ return lang, confidence
542
+
543
+ def extract_language_from_classes(self, classes: list[str]) -> str | None:
544
+ """
545
+ Extract language from CSS class list.
546
+
547
+ Supports patterns:
548
+ - language-* (e.g., language-python)
549
+ - lang-* (e.g., lang-javascript)
550
+ - brush: * (e.g., brush: java)
551
+ - Bare names (e.g., python, java)
552
+
553
+ Args:
554
+ classes: List of CSS class names
555
+
556
+ Returns:
557
+ Language string or None if not found
558
+ """
559
+ if not classes:
560
+ return None
561
+
562
+ for cls in classes:
563
+ # Handle brush: pattern
564
+ if "brush:" in cls:
565
+ parts = cls.split("brush:")
566
+ if len(parts) > 1:
567
+ lang = parts[1].strip().lower()
568
+ if lang in KNOWN_LANGUAGES:
569
+ return lang
570
+
571
+ # Handle language- prefix
572
+ if cls.startswith("language-"):
573
+ lang = cls[9:].lower()
574
+ if lang in KNOWN_LANGUAGES:
575
+ return lang
576
+
577
+ # Handle lang- prefix
578
+ if cls.startswith("lang-"):
579
+ lang = cls[5:].lower()
580
+ if lang in KNOWN_LANGUAGES:
581
+ return lang
582
+
583
+ # Handle bare class name
584
+ if cls.lower() in KNOWN_LANGUAGES:
585
+ return cls.lower()
586
+
587
+ return None
588
+
589
+ def _calculate_confidence(self, code: str) -> dict[str, float]:
590
+ """
591
+ Calculate weighted confidence scores for all languages.
592
+
593
+ Args:
594
+ code: Code content string
595
+
596
+ Returns:
597
+ Dictionary mapping language names to confidence scores (0.0-1.0)
598
+ """
599
+ scores: dict[str, float] = {}
600
+
601
+ for lang, compiled_patterns in self._pattern_cache.items():
602
+ total_score = 0
603
+
604
+ for pattern, weight in compiled_patterns:
605
+ if pattern.search(code):
606
+ total_score += weight
607
+
608
+ if total_score > 0:
609
+ # Normalize score to 0-1 range
610
+ # Score of 10+ = 1.0 confidence
611
+ confidence = min(total_score / 10.0, 1.0)
612
+ scores[lang] = confidence
613
+
614
+ return scores
@@ -0,0 +1,60 @@
1
+ # ABOUTME: Detects and validates llms.txt file availability at documentation URLs
2
+ # ABOUTME: Supports llms-full.txt, llms.txt, and llms-small.txt variants
3
+
4
+ from urllib.parse import urlparse
5
+
6
+ import requests
7
+
8
+
9
+ class LlmsTxtDetector:
10
+ """Detect llms.txt files at documentation URLs"""
11
+
12
+ VARIANTS = [("llms-full.txt", "full"), ("llms.txt", "standard"), ("llms-small.txt", "small")]
13
+
14
+ def __init__(self, base_url: str):
15
+ self.base_url = base_url.rstrip("/")
16
+
17
+ def detect(self) -> dict[str, str] | None:
18
+ """
19
+ Detect available llms.txt variant.
20
+
21
+ Returns:
22
+ Dict with 'url' and 'variant' keys, or None if not found
23
+ """
24
+ parsed = urlparse(self.base_url)
25
+ root_url = f"{parsed.scheme}://{parsed.netloc}"
26
+
27
+ for filename, variant in self.VARIANTS:
28
+ url = f"{root_url}/{filename}"
29
+
30
+ if self._check_url_exists(url):
31
+ return {"url": url, "variant": variant}
32
+
33
+ return None
34
+
35
+ def detect_all(self) -> list[dict[str, str]]:
36
+ """
37
+ Detect all available llms.txt variants.
38
+
39
+ Returns:
40
+ List of dicts with 'url' and 'variant' keys for each found variant
41
+ """
42
+ found_variants = []
43
+
44
+ for filename, variant in self.VARIANTS:
45
+ parsed = urlparse(self.base_url)
46
+ root_url = f"{parsed.scheme}://{parsed.netloc}"
47
+ url = f"{root_url}/{filename}"
48
+
49
+ if self._check_url_exists(url):
50
+ found_variants.append({"url": url, "variant": variant})
51
+
52
+ return found_variants
53
+
54
+ def _check_url_exists(self, url: str) -> bool:
55
+ """Check if URL returns 200 status"""
56
+ try:
57
+ response = requests.head(url, timeout=5, allow_redirects=True)
58
+ return response.status_code == 200
59
+ except requests.RequestException:
60
+ return False