unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. unrealon/__init__.py +23 -21
  2. unrealon-1.1.0.dist-info/METADATA +164 -0
  3. unrealon-1.1.0.dist-info/RECORD +82 -0
  4. {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
  5. unrealon-1.1.0.dist-info/entry_points.txt +9 -0
  6. {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
  7. unrealon_bridge/__init__.py +114 -0
  8. unrealon_bridge/cli.py +316 -0
  9. unrealon_bridge/client/__init__.py +93 -0
  10. unrealon_bridge/client/base.py +78 -0
  11. unrealon_bridge/client/commands.py +89 -0
  12. unrealon_bridge/client/connection.py +90 -0
  13. unrealon_bridge/client/events.py +65 -0
  14. unrealon_bridge/client/health.py +38 -0
  15. unrealon_bridge/client/html_parser.py +146 -0
  16. unrealon_bridge/client/logging.py +139 -0
  17. unrealon_bridge/client/proxy.py +70 -0
  18. unrealon_bridge/client/scheduler.py +450 -0
  19. unrealon_bridge/client/session.py +70 -0
  20. unrealon_bridge/configs/__init__.py +14 -0
  21. unrealon_bridge/configs/bridge_config.py +212 -0
  22. unrealon_bridge/configs/bridge_config.yaml +39 -0
  23. unrealon_bridge/models/__init__.py +138 -0
  24. unrealon_bridge/models/base.py +28 -0
  25. unrealon_bridge/models/command.py +41 -0
  26. unrealon_bridge/models/events.py +40 -0
  27. unrealon_bridge/models/html_parser.py +79 -0
  28. unrealon_bridge/models/logging.py +55 -0
  29. unrealon_bridge/models/parser.py +63 -0
  30. unrealon_bridge/models/proxy.py +41 -0
  31. unrealon_bridge/models/requests.py +95 -0
  32. unrealon_bridge/models/responses.py +88 -0
  33. unrealon_bridge/models/scheduler.py +592 -0
  34. unrealon_bridge/models/session.py +28 -0
  35. unrealon_bridge/server/__init__.py +91 -0
  36. unrealon_bridge/server/base.py +171 -0
  37. unrealon_bridge/server/handlers/__init__.py +23 -0
  38. unrealon_bridge/server/handlers/command.py +110 -0
  39. unrealon_bridge/server/handlers/html_parser.py +139 -0
  40. unrealon_bridge/server/handlers/logging.py +95 -0
  41. unrealon_bridge/server/handlers/parser.py +95 -0
  42. unrealon_bridge/server/handlers/proxy.py +75 -0
  43. unrealon_bridge/server/handlers/scheduler.py +545 -0
  44. unrealon_bridge/server/handlers/session.py +66 -0
  45. unrealon_browser/__init__.py +61 -18
  46. unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
  47. unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
  48. unrealon_browser/{src/core → core}/browser_manager.py +2 -2
  49. unrealon_browser/{src/managers → managers}/captcha.py +1 -1
  50. unrealon_browser/{src/managers → managers}/cookies.py +1 -1
  51. unrealon_browser/managers/logger_bridge.py +231 -0
  52. unrealon_browser/{src/managers → managers}/profile.py +1 -1
  53. unrealon_driver/__init__.py +73 -19
  54. unrealon_driver/browser/__init__.py +8 -0
  55. unrealon_driver/browser/config.py +74 -0
  56. unrealon_driver/browser/manager.py +416 -0
  57. unrealon_driver/exceptions.py +28 -0
  58. unrealon_driver/parser/__init__.py +55 -0
  59. unrealon_driver/parser/cli_manager.py +141 -0
  60. unrealon_driver/parser/daemon_manager.py +227 -0
  61. unrealon_driver/parser/managers/__init__.py +46 -0
  62. unrealon_driver/parser/managers/browser.py +51 -0
  63. unrealon_driver/parser/managers/config.py +281 -0
  64. unrealon_driver/parser/managers/error.py +412 -0
  65. unrealon_driver/parser/managers/html.py +732 -0
  66. unrealon_driver/parser/managers/logging.py +609 -0
  67. unrealon_driver/parser/managers/result.py +321 -0
  68. unrealon_driver/parser/parser_manager.py +628 -0
  69. unrealon/sdk_config.py +0 -88
  70. unrealon-1.0.9.dist-info/METADATA +0 -810
  71. unrealon-1.0.9.dist-info/RECORD +0 -246
  72. unrealon_browser/pyproject.toml +0 -182
  73. unrealon_browser/src/__init__.py +0 -62
  74. unrealon_browser/src/managers/logger_bridge.py +0 -395
  75. unrealon_driver/README.md +0 -204
  76. unrealon_driver/pyproject.toml +0 -187
  77. unrealon_driver/src/__init__.py +0 -90
  78. unrealon_driver/src/cli/__init__.py +0 -10
  79. unrealon_driver/src/cli/main.py +0 -66
  80. unrealon_driver/src/cli/simple.py +0 -510
  81. unrealon_driver/src/config/__init__.py +0 -11
  82. unrealon_driver/src/config/auto_config.py +0 -478
  83. unrealon_driver/src/core/__init__.py +0 -18
  84. unrealon_driver/src/core/exceptions.py +0 -289
  85. unrealon_driver/src/core/parser.py +0 -638
  86. unrealon_driver/src/dto/__init__.py +0 -66
  87. unrealon_driver/src/dto/cli.py +0 -119
  88. unrealon_driver/src/dto/config.py +0 -18
  89. unrealon_driver/src/dto/events.py +0 -237
  90. unrealon_driver/src/dto/execution.py +0 -313
  91. unrealon_driver/src/dto/services.py +0 -311
  92. unrealon_driver/src/execution/__init__.py +0 -23
  93. unrealon_driver/src/execution/daemon_mode.py +0 -317
  94. unrealon_driver/src/execution/interactive_mode.py +0 -88
  95. unrealon_driver/src/execution/modes.py +0 -45
  96. unrealon_driver/src/execution/scheduled_mode.py +0 -209
  97. unrealon_driver/src/execution/test_mode.py +0 -250
  98. unrealon_driver/src/logging/__init__.py +0 -24
  99. unrealon_driver/src/logging/driver_logger.py +0 -512
  100. unrealon_driver/src/services/__init__.py +0 -24
  101. unrealon_driver/src/services/browser_service.py +0 -726
  102. unrealon_driver/src/services/llm/__init__.py +0 -15
  103. unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
  104. unrealon_driver/src/services/llm/llm.py +0 -195
  105. unrealon_driver/src/services/logger_service.py +0 -232
  106. unrealon_driver/src/services/metrics_service.py +0 -185
  107. unrealon_driver/src/services/scheduler_service.py +0 -489
  108. unrealon_driver/src/services/websocket_service.py +0 -362
  109. unrealon_driver/src/utils/__init__.py +0 -16
  110. unrealon_driver/src/utils/service_factory.py +0 -317
  111. unrealon_driver/src/utils/time_formatter.py +0 -338
  112. unrealon_llm/README.md +0 -44
  113. unrealon_llm/__init__.py +0 -26
  114. unrealon_llm/pyproject.toml +0 -154
  115. unrealon_llm/src/__init__.py +0 -228
  116. unrealon_llm/src/cli/__init__.py +0 -0
  117. unrealon_llm/src/core/__init__.py +0 -11
  118. unrealon_llm/src/core/smart_client.py +0 -438
  119. unrealon_llm/src/dto/__init__.py +0 -155
  120. unrealon_llm/src/dto/models/__init__.py +0 -0
  121. unrealon_llm/src/dto/models/config.py +0 -343
  122. unrealon_llm/src/dto/models/core.py +0 -328
  123. unrealon_llm/src/dto/models/enums.py +0 -123
  124. unrealon_llm/src/dto/models/html_analysis.py +0 -345
  125. unrealon_llm/src/dto/models/statistics.py +0 -473
  126. unrealon_llm/src/dto/models/translation.py +0 -383
  127. unrealon_llm/src/dto/models/type_conversion.py +0 -462
  128. unrealon_llm/src/dto/schemas/__init__.py +0 -0
  129. unrealon_llm/src/exceptions.py +0 -392
  130. unrealon_llm/src/llm_config/__init__.py +0 -20
  131. unrealon_llm/src/llm_config/logging_config.py +0 -178
  132. unrealon_llm/src/llm_logging/__init__.py +0 -42
  133. unrealon_llm/src/llm_logging/llm_events.py +0 -107
  134. unrealon_llm/src/llm_logging/llm_logger.py +0 -466
  135. unrealon_llm/src/managers/__init__.py +0 -15
  136. unrealon_llm/src/managers/cache_manager.py +0 -67
  137. unrealon_llm/src/managers/cost_manager.py +0 -107
  138. unrealon_llm/src/managers/request_manager.py +0 -298
  139. unrealon_llm/src/modules/__init__.py +0 -0
  140. unrealon_llm/src/modules/html_processor/__init__.py +0 -25
  141. unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
  142. unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
  143. unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
  144. unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
  145. unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
  146. unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
  147. unrealon_llm/src/modules/html_processor/processor.py +0 -102
  148. unrealon_llm/src/modules/llm/__init__.py +0 -0
  149. unrealon_llm/src/modules/translator/__init__.py +0 -0
  150. unrealon_llm/src/provider.py +0 -116
  151. unrealon_llm/src/utils/__init__.py +0 -95
  152. unrealon_llm/src/utils/common.py +0 -64
  153. unrealon_llm/src/utils/data_extractor.py +0 -188
  154. unrealon_llm/src/utils/html_cleaner.py +0 -767
  155. unrealon_llm/src/utils/language_detector.py +0 -308
  156. unrealon_llm/src/utils/models_cache.py +0 -592
  157. unrealon_llm/src/utils/smart_counter.py +0 -229
  158. unrealon_llm/src/utils/token_counter.py +0 -189
  159. unrealon_sdk/README.md +0 -25
  160. unrealon_sdk/__init__.py +0 -30
  161. unrealon_sdk/pyproject.toml +0 -231
  162. unrealon_sdk/src/__init__.py +0 -150
  163. unrealon_sdk/src/cli/__init__.py +0 -12
  164. unrealon_sdk/src/cli/commands/__init__.py +0 -22
  165. unrealon_sdk/src/cli/commands/benchmark.py +0 -42
  166. unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
  167. unrealon_sdk/src/cli/commands/health.py +0 -46
  168. unrealon_sdk/src/cli/commands/integration.py +0 -498
  169. unrealon_sdk/src/cli/commands/reports.py +0 -43
  170. unrealon_sdk/src/cli/commands/security.py +0 -36
  171. unrealon_sdk/src/cli/commands/server.py +0 -483
  172. unrealon_sdk/src/cli/commands/servers.py +0 -56
  173. unrealon_sdk/src/cli/commands/tests.py +0 -55
  174. unrealon_sdk/src/cli/main.py +0 -126
  175. unrealon_sdk/src/cli/utils/reporter.py +0 -519
  176. unrealon_sdk/src/clients/openapi.yaml +0 -3347
  177. unrealon_sdk/src/clients/python_http/__init__.py +0 -3
  178. unrealon_sdk/src/clients/python_http/api_config.py +0 -228
  179. unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
  180. unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
  181. unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
  182. unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
  183. unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
  184. unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
  185. unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
  186. unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
  187. unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
  188. unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
  189. unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
  190. unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
  191. unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
  192. unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
  193. unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
  194. unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
  195. unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
  196. unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
  197. unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
  198. unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
  199. unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
  200. unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
  201. unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
  202. unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
  203. unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
  204. unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
  205. unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
  206. unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
  207. unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
  208. unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
  209. unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
  210. unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
  211. unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
  212. unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
  213. unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
  214. unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
  215. unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
  216. unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
  217. unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
  218. unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
  219. unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
  220. unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
  221. unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
  222. unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
  223. unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
  224. unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
  225. unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
  226. unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
  227. unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
  228. unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
  229. unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
  230. unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
  231. unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
  232. unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
  233. unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
  234. unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
  235. unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
  236. unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
  237. unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
  238. unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
  239. unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
  240. unrealon_sdk/src/clients/python_websocket/client.py +0 -490
  241. unrealon_sdk/src/clients/python_websocket/events.py +0 -732
  242. unrealon_sdk/src/clients/python_websocket/example.py +0 -136
  243. unrealon_sdk/src/clients/python_websocket/types.py +0 -871
  244. unrealon_sdk/src/core/__init__.py +0 -64
  245. unrealon_sdk/src/core/client.py +0 -556
  246. unrealon_sdk/src/core/config.py +0 -465
  247. unrealon_sdk/src/core/exceptions.py +0 -239
  248. unrealon_sdk/src/core/metadata.py +0 -191
  249. unrealon_sdk/src/core/models.py +0 -142
  250. unrealon_sdk/src/core/types.py +0 -68
  251. unrealon_sdk/src/dto/__init__.py +0 -268
  252. unrealon_sdk/src/dto/authentication.py +0 -108
  253. unrealon_sdk/src/dto/cache.py +0 -208
  254. unrealon_sdk/src/dto/common.py +0 -19
  255. unrealon_sdk/src/dto/concurrency.py +0 -393
  256. unrealon_sdk/src/dto/events.py +0 -108
  257. unrealon_sdk/src/dto/health.py +0 -339
  258. unrealon_sdk/src/dto/load_balancing.py +0 -336
  259. unrealon_sdk/src/dto/logging.py +0 -230
  260. unrealon_sdk/src/dto/performance.py +0 -165
  261. unrealon_sdk/src/dto/rate_limiting.py +0 -295
  262. unrealon_sdk/src/dto/resource_pooling.py +0 -128
  263. unrealon_sdk/src/dto/structured_logging.py +0 -112
  264. unrealon_sdk/src/dto/task_scheduling.py +0 -121
  265. unrealon_sdk/src/dto/websocket.py +0 -55
  266. unrealon_sdk/src/enterprise/__init__.py +0 -59
  267. unrealon_sdk/src/enterprise/authentication.py +0 -401
  268. unrealon_sdk/src/enterprise/cache_manager.py +0 -578
  269. unrealon_sdk/src/enterprise/error_recovery.py +0 -494
  270. unrealon_sdk/src/enterprise/event_system.py +0 -549
  271. unrealon_sdk/src/enterprise/health_monitor.py +0 -747
  272. unrealon_sdk/src/enterprise/load_balancer.py +0 -964
  273. unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
  274. unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
  275. unrealon_sdk/src/enterprise/logging/development.py +0 -744
  276. unrealon_sdk/src/enterprise/logging/service.py +0 -410
  277. unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
  278. unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
  279. unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
  280. unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
  281. unrealon_sdk/src/enterprise/resource_pool.py +0 -763
  282. unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
  283. unrealon_sdk/src/internal/__init__.py +0 -10
  284. unrealon_sdk/src/internal/command_router.py +0 -497
  285. unrealon_sdk/src/internal/connection_manager.py +0 -397
  286. unrealon_sdk/src/internal/http_client.py +0 -446
  287. unrealon_sdk/src/internal/websocket_client.py +0 -420
  288. unrealon_sdk/src/provider.py +0 -471
  289. unrealon_sdk/src/utils.py +0 -234
  290. /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
  291. /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
  292. /unrealon_browser/{src/cli → cli}/main.py +0 -0
  293. /unrealon_browser/{src/core → core}/__init__.py +0 -0
  294. /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
  295. /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
  296. /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
  297. /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
  298. /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
  299. /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
  300. /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
  301. /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
  302. /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
@@ -1,308 +0,0 @@
1
- """
2
- Language Detection Utilities
3
-
4
- Fast and accurate language detection for text content using langdetect
5
- with optimizations for short texts and technical content.
6
- """
7
-
8
- import re
9
- from typing import Dict, List, Optional, Tuple
10
-
11
- from langdetect import DetectorFactory, LangDetectException, detect, detect_langs
12
-
13
- from unrealon_llm.src.dto import LanguageCode, LanguageDetection
14
- from unrealon_llm.src.exceptions import LanguageDetectionError
15
-
16
-
17
class LanguageDetector:
    """Language detection with langdetect plus a regex-pattern fallback.

    ``detect_language`` first asks langdetect; if its confidence is below
    0.8 the keyword/script patterns in ``self.language_patterns`` are tried,
    and the best available answer is returned (see that method for the
    exact thresholds).
    """

    def __init__(self):
        """Initialize language detector with deterministic results."""
        # Fixed seed so repeated detections give consistent results.
        # NOTE: this assigns an attribute on the shared DetectorFactory class.
        DetectorFactory.seed = 0

        # Regex patterns per language, used only by _detect_with_patterns.
        self.language_patterns = {
            LanguageCode.EN: [
                r'\b(the|and|or|but|in|on|at|to|for|of|with|by)\b',
                r'\b(this|that|these|those|what|where|when|why|how)\b',
                r'\b(is|are|was|were|be|been|being|have|has|had)\b'
            ],
            LanguageCode.KO: [
                r'[가-힣]+',  # Korean characters
                r'\b(이|그|저|의|을|를|에|에서|으로|와|과)\b',
                r'\b(입니다|습니다|했습니다|있습니다|없습니다)\b'
            ],
            LanguageCode.ZH: [
                r'[\u4e00-\u9fff]+',  # Chinese characters
                r'\b(的|了|在|是|我|你|他|她|我们|你们|他们)\b',
                r'\b(这|那|什么|哪里|什么时候|为什么|怎么)\b'
            ],
            LanguageCode.JA: [
                r'[ひらがな\u3040-\u309f\u30a0-\u30ff]+',  # Hiragana + Katakana
                r'\b(の|を|に|で|から|まで|と|や|が|は)\b',
                r'\b(です|である|します|しました|いる|ある)\b'
            ],
            LanguageCode.RU: [
                r'[а-яё]+',  # Cyrillic characters
                r'\b(и|или|но|в|на|за|для|от|с|по|о)\b',
                r'\b(это|тот|эти|те|что|где|когда|почему|как)\b'
            ],
            LanguageCode.ES: [
                r'\b(el|la|los|las|un|una|de|en|y|o|pero)\b',
                r'\b(que|donde|cuando|por|para|con|sin|sobre)\b',
                r'\b(es|son|fue|fueron|ser|estar|haber|tener)\b'
            ],
            LanguageCode.FR: [
                r'\b(le|la|les|un|une|des|de|du|en|et|ou)\b',
                r'\b(que|où|quand|pourquoi|comment|avec|sans)\b',
                r'\b(est|sont|était|étaient|être|avoir|faire)\b'
            ],
            LanguageCode.DE: [
                r'\b(der|die|das|ein|eine|und|oder|aber|in|auf)\b',
                r'\b(das|was|wo|wann|warum|wie|mit|ohne|für)\b',
                r'\b(ist|sind|war|waren|sein|haben|werden)\b'
            ]
        }

    def detect_language(self, text: str) -> LanguageDetection:
        """
        Detect the language of the given text.

        Decision order: langdetect result if its confidence is >= 0.8;
        otherwise the pattern result if its confidence is >= 0.7; otherwise
        the langdetect result if its confidence is > 0.5; otherwise English
        with confidence 0.3.

        Args:
            text: Input text to analyze

        Returns:
            LanguageDetection with detected language and confidence

        Raises:
            LanguageDetectionError: If the text is empty, shorter than three
                characters after cleaning, or detection fails
        """
        if not text or not text.strip():
            raise LanguageDetectionError("Empty text provided for language detection")

        # Clean text for better detection
        cleaned_text = self._clean_text(text)

        if len(cleaned_text) < 3:
            raise LanguageDetectionError("Text too short for reliable language detection")

        try:
            # Try primary detection with langdetect
            result = self._detect_with_langdetect(cleaned_text)
            if result.confidence >= 0.8:
                return result

            # Fallback to pattern-based detection
            pattern_result = self._detect_with_patterns(cleaned_text)
            if pattern_result.confidence >= 0.7:
                return pattern_result

            # If both methods have low confidence, use langdetect result
            if result.confidence > 0.5:
                return result

            # Last resort: assume English
            return LanguageDetection(
                detected_language=LanguageCode.EN,
                confidence=0.3,
                alternative_languages=[
                    {"language": LanguageCode.EN, "confidence": 0.3}
                ]
            )

        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise LanguageDetectionError(f"Language detection failed: {str(e)}") from e

    def detect_multiple_languages(self, text: str, top_n: int = 3) -> List[Dict[str, float]]:
        """
        Detect multiple possible languages with probabilities

        Languages langdetect reports that have no LanguageCode mapping are
        dropped *before* the list is cut to ``top_n``, and raw codes that map
        to the same LanguageCode are merged into one entry.

        Args:
            text: Input text to analyze
            top_n: Number of top languages to return

        Returns:
            List of ``{"language": ..., "confidence": ...}`` dicts, best first

        Raises:
            LanguageDetectionError: If langdetect fails and the single-language
                fallback fails as well
        """
        try:
            cleaned_text = self._clean_text(text)
            languages = detect_langs(cleaned_text)
        except LangDetectException:
            # Fallback to single detection
            single_result = self.detect_language(text)
            return [{"language": single_result.detected_language, "confidence": single_result.confidence}]

        ranked = self._mapped_probabilities(languages)
        return [
            {"language": code, "confidence": prob}
            for code, prob in ranked[:top_n]
        ]

    def is_language(self, text: str, expected_language: LanguageCode, threshold: float = 0.8) -> bool:
        """
        Check if text is in expected language with given confidence threshold

        Args:
            text: Text to check
            expected_language: Expected language code
            threshold: Minimum confidence threshold

        Returns:
            True if the detected language equals ``expected_language`` with
            confidence >= ``threshold``; False otherwise, including when
            detection raises LanguageDetectionError
        """
        try:
            detection = self.detect_language(text)
        except LanguageDetectionError:
            return False
        return (detection.detected_language == expected_language and
                detection.confidence >= threshold)

    def _mapped_probabilities(self, lang_probs) -> List[Tuple[LanguageCode, float]]:
        """Collapse langdetect results onto our language codes, best first.

        Probabilities of raw codes that share a LanguageCode (e.g. ``zh-cn``
        and ``zh-tw``) are summed and capped at 1.0; unmapped codes are dropped.
        """
        merged = {}
        for lang_info in lang_probs:
            code = self._map_to_our_language_code(lang_info.lang)
            if code:
                merged[code] = min(merged.get(code, 0.0) + float(lang_info.prob), 1.0)
        return sorted(merged.items(), key=lambda item: item[1], reverse=True)

    def _detect_with_langdetect(self, text: str) -> LanguageDetection:
        """Detect language using langdetect library.

        The detector is run once; the first entry of its result is treated as
        the primary language. If that language has no LanguageCode mapping,
        English with confidence 0.5 is returned and every mapped language is
        listed as an alternative.

        Raises:
            LanguageDetectionError: If langdetect raises LangDetectException
        """
        try:
            lang_probs = detect_langs(text)
        except LangDetectException as e:
            raise LanguageDetectionError(f"langdetect failed: {str(e)}") from e

        ranked = self._mapped_probabilities(lang_probs)
        detected = (
            self._map_to_our_language_code(lang_probs[0].lang) if lang_probs else None
        )

        alternatives = [
            {"language": code, "confidence": prob}
            for code, prob in ranked
            if code != detected
        ]

        if detected:
            confidence = dict(ranked)[detected]
        else:
            detected = LanguageCode.EN  # Default fallback
            confidence = 0.5

        return LanguageDetection(
            detected_language=detected,
            confidence=confidence,
            alternative_languages=alternatives
        )

    def _detect_with_patterns(self, text: str) -> LanguageDetection:
        """Fallback pattern-based language detection.

        Each language's score is its total regex match count divided by the
        number of whitespace-separated words; confidence is twice the score,
        capped at 1.0. Text without any words yields English at 0.3.
        """
        default = LanguageDetection(
            detected_language=LanguageCode.EN,
            confidence=0.3,
            alternative_languages=[]
        )

        # Guard on the word count actually used as the divisor, so
        # whitespace-only input cannot raise ZeroDivisionError.
        word_count = len(text.split())
        if word_count == 0:
            return default

        text_lower = text.lower()
        language_scores = {}
        for lang_code, patterns in self.language_patterns.items():
            hits = sum(
                len(re.findall(pattern, text_lower, re.IGNORECASE))
                for pattern in patterns
            )
            language_scores[lang_code] = hits / word_count

        if not language_scores:
            return default

        # Stable sort: on ties the first-declared language stays first.
        ranked = sorted(language_scores.items(), key=lambda item: item[1], reverse=True)
        best_lang, best_score = ranked[0]

        alternatives = [
            {"language": lang, "confidence": min(score * 2, 1.0)}
            for lang, score in ranked[1:3]
            if score > 0
        ]

        return LanguageDetection(
            detected_language=best_lang,
            confidence=min(best_score * 2, 1.0),  # Scale confidence
            alternative_languages=alternatives
        )

    def _clean_text(self, text: str) -> str:
        """Clean text for better language detection.

        Removes HTML tags, URLs, e-mail addresses and standalone numbers,
        then collapses whitespace. Tags go first (replaced by a space) so a
        URL match cannot run into adjacent markup, and whitespace is
        collapsed last so no removal step leaves doubled spaces behind.
        """
        # Remove HTML tags
        text = re.sub(r'<[^>]+>', ' ', text)

        # Remove URLs (through the next whitespace, fragments included)
        text = re.sub(r'https?://\S+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove numbers (but keep words with numbers)
        text = re.sub(r'\b\d+\b', '', text)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def _map_to_our_language_code(self, langdetect_code: str) -> Optional[LanguageCode]:
        """Map langdetect language codes to our enum (None if unsupported)."""
        mapping = {
            'en': LanguageCode.EN,
            'ko': LanguageCode.KO,
            'zh-cn': LanguageCode.ZH,
            'zh-tw': LanguageCode.ZH,  # Traditional Chinese, also emitted by langdetect
            'zh': LanguageCode.ZH,
            'ja': LanguageCode.JA,
            'ru': LanguageCode.RU,
            'es': LanguageCode.ES,
            'fr': LanguageCode.FR,
            'de': LanguageCode.DE,
            'it': LanguageCode.IT,
            'pt': LanguageCode.PT,
            'ar': LanguageCode.AR,
            'hi': LanguageCode.HI,
            'tr': LanguageCode.TR,
            'pl': LanguageCode.PL,
            'uk': LanguageCode.UK,
        }
        return mapping.get(langdetect_code.lower())
290
-
291
-
292
- # Convenience functions
293
def detect_language(text: str) -> LanguageDetection:
    """Detect the language of ``text`` with a freshly built LanguageDetector.

    Module-level shortcut: a new detector is constructed on every call and
    its ``detect_language`` result is returned unchanged.
    """
    return LanguageDetector().detect_language(text)
297
-
298
-
299
def is_language(text: str, expected_language: LanguageCode, threshold: float = 0.8) -> bool:
    """Check whether ``text`` is in ``expected_language``.

    Module-level shortcut: a new LanguageDetector is constructed on every
    call and its ``is_language`` result is returned unchanged.
    """
    return LanguageDetector().is_language(text, expected_language, threshold)
303
-
304
-
305
def detect_multiple_languages(text: str, top_n: int = 3) -> List[Dict[str, float]]:
    """Return up to ``top_n`` candidate languages for ``text``.

    Module-level shortcut: a new LanguageDetector is constructed on every
    call and its ``detect_multiple_languages`` result is returned unchanged.
    """
    return LanguageDetector().detect_multiple_languages(text, top_n)