unrealon 1.0.9__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unrealon/__init__.py +23 -21
- unrealon-1.1.0.dist-info/METADATA +164 -0
- unrealon-1.1.0.dist-info/RECORD +82 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info}/WHEEL +1 -1
- unrealon-1.1.0.dist-info/entry_points.txt +9 -0
- {unrealon-1.0.9.dist-info → unrealon-1.1.0.dist-info/licenses}/LICENSE +1 -1
- unrealon_bridge/__init__.py +114 -0
- unrealon_bridge/cli.py +316 -0
- unrealon_bridge/client/__init__.py +93 -0
- unrealon_bridge/client/base.py +78 -0
- unrealon_bridge/client/commands.py +89 -0
- unrealon_bridge/client/connection.py +90 -0
- unrealon_bridge/client/events.py +65 -0
- unrealon_bridge/client/health.py +38 -0
- unrealon_bridge/client/html_parser.py +146 -0
- unrealon_bridge/client/logging.py +139 -0
- unrealon_bridge/client/proxy.py +70 -0
- unrealon_bridge/client/scheduler.py +450 -0
- unrealon_bridge/client/session.py +70 -0
- unrealon_bridge/configs/__init__.py +14 -0
- unrealon_bridge/configs/bridge_config.py +212 -0
- unrealon_bridge/configs/bridge_config.yaml +39 -0
- unrealon_bridge/models/__init__.py +138 -0
- unrealon_bridge/models/base.py +28 -0
- unrealon_bridge/models/command.py +41 -0
- unrealon_bridge/models/events.py +40 -0
- unrealon_bridge/models/html_parser.py +79 -0
- unrealon_bridge/models/logging.py +55 -0
- unrealon_bridge/models/parser.py +63 -0
- unrealon_bridge/models/proxy.py +41 -0
- unrealon_bridge/models/requests.py +95 -0
- unrealon_bridge/models/responses.py +88 -0
- unrealon_bridge/models/scheduler.py +592 -0
- unrealon_bridge/models/session.py +28 -0
- unrealon_bridge/server/__init__.py +91 -0
- unrealon_bridge/server/base.py +171 -0
- unrealon_bridge/server/handlers/__init__.py +23 -0
- unrealon_bridge/server/handlers/command.py +110 -0
- unrealon_bridge/server/handlers/html_parser.py +139 -0
- unrealon_bridge/server/handlers/logging.py +95 -0
- unrealon_bridge/server/handlers/parser.py +95 -0
- unrealon_bridge/server/handlers/proxy.py +75 -0
- unrealon_bridge/server/handlers/scheduler.py +545 -0
- unrealon_bridge/server/handlers/session.py +66 -0
- unrealon_browser/__init__.py +61 -18
- unrealon_browser/{src/cli → cli}/browser_cli.py +6 -13
- unrealon_browser/{src/cli → cli}/cookies_cli.py +5 -1
- unrealon_browser/{src/core → core}/browser_manager.py +2 -2
- unrealon_browser/{src/managers → managers}/captcha.py +1 -1
- unrealon_browser/{src/managers → managers}/cookies.py +1 -1
- unrealon_browser/managers/logger_bridge.py +231 -0
- unrealon_browser/{src/managers → managers}/profile.py +1 -1
- unrealon_driver/__init__.py +73 -19
- unrealon_driver/browser/__init__.py +8 -0
- unrealon_driver/browser/config.py +74 -0
- unrealon_driver/browser/manager.py +416 -0
- unrealon_driver/exceptions.py +28 -0
- unrealon_driver/parser/__init__.py +55 -0
- unrealon_driver/parser/cli_manager.py +141 -0
- unrealon_driver/parser/daemon_manager.py +227 -0
- unrealon_driver/parser/managers/__init__.py +46 -0
- unrealon_driver/parser/managers/browser.py +51 -0
- unrealon_driver/parser/managers/config.py +281 -0
- unrealon_driver/parser/managers/error.py +412 -0
- unrealon_driver/parser/managers/html.py +732 -0
- unrealon_driver/parser/managers/logging.py +609 -0
- unrealon_driver/parser/managers/result.py +321 -0
- unrealon_driver/parser/parser_manager.py +628 -0
- unrealon/sdk_config.py +0 -88
- unrealon-1.0.9.dist-info/METADATA +0 -810
- unrealon-1.0.9.dist-info/RECORD +0 -246
- unrealon_browser/pyproject.toml +0 -182
- unrealon_browser/src/__init__.py +0 -62
- unrealon_browser/src/managers/logger_bridge.py +0 -395
- unrealon_driver/README.md +0 -204
- unrealon_driver/pyproject.toml +0 -187
- unrealon_driver/src/__init__.py +0 -90
- unrealon_driver/src/cli/__init__.py +0 -10
- unrealon_driver/src/cli/main.py +0 -66
- unrealon_driver/src/cli/simple.py +0 -510
- unrealon_driver/src/config/__init__.py +0 -11
- unrealon_driver/src/config/auto_config.py +0 -478
- unrealon_driver/src/core/__init__.py +0 -18
- unrealon_driver/src/core/exceptions.py +0 -289
- unrealon_driver/src/core/parser.py +0 -638
- unrealon_driver/src/dto/__init__.py +0 -66
- unrealon_driver/src/dto/cli.py +0 -119
- unrealon_driver/src/dto/config.py +0 -18
- unrealon_driver/src/dto/events.py +0 -237
- unrealon_driver/src/dto/execution.py +0 -313
- unrealon_driver/src/dto/services.py +0 -311
- unrealon_driver/src/execution/__init__.py +0 -23
- unrealon_driver/src/execution/daemon_mode.py +0 -317
- unrealon_driver/src/execution/interactive_mode.py +0 -88
- unrealon_driver/src/execution/modes.py +0 -45
- unrealon_driver/src/execution/scheduled_mode.py +0 -209
- unrealon_driver/src/execution/test_mode.py +0 -250
- unrealon_driver/src/logging/__init__.py +0 -24
- unrealon_driver/src/logging/driver_logger.py +0 -512
- unrealon_driver/src/services/__init__.py +0 -24
- unrealon_driver/src/services/browser_service.py +0 -726
- unrealon_driver/src/services/llm/__init__.py +0 -15
- unrealon_driver/src/services/llm/browser_llm_service.py +0 -363
- unrealon_driver/src/services/llm/llm.py +0 -195
- unrealon_driver/src/services/logger_service.py +0 -232
- unrealon_driver/src/services/metrics_service.py +0 -185
- unrealon_driver/src/services/scheduler_service.py +0 -489
- unrealon_driver/src/services/websocket_service.py +0 -362
- unrealon_driver/src/utils/__init__.py +0 -16
- unrealon_driver/src/utils/service_factory.py +0 -317
- unrealon_driver/src/utils/time_formatter.py +0 -338
- unrealon_llm/README.md +0 -44
- unrealon_llm/__init__.py +0 -26
- unrealon_llm/pyproject.toml +0 -154
- unrealon_llm/src/__init__.py +0 -228
- unrealon_llm/src/cli/__init__.py +0 -0
- unrealon_llm/src/core/__init__.py +0 -11
- unrealon_llm/src/core/smart_client.py +0 -438
- unrealon_llm/src/dto/__init__.py +0 -155
- unrealon_llm/src/dto/models/__init__.py +0 -0
- unrealon_llm/src/dto/models/config.py +0 -343
- unrealon_llm/src/dto/models/core.py +0 -328
- unrealon_llm/src/dto/models/enums.py +0 -123
- unrealon_llm/src/dto/models/html_analysis.py +0 -345
- unrealon_llm/src/dto/models/statistics.py +0 -473
- unrealon_llm/src/dto/models/translation.py +0 -383
- unrealon_llm/src/dto/models/type_conversion.py +0 -462
- unrealon_llm/src/dto/schemas/__init__.py +0 -0
- unrealon_llm/src/exceptions.py +0 -392
- unrealon_llm/src/llm_config/__init__.py +0 -20
- unrealon_llm/src/llm_config/logging_config.py +0 -178
- unrealon_llm/src/llm_logging/__init__.py +0 -42
- unrealon_llm/src/llm_logging/llm_events.py +0 -107
- unrealon_llm/src/llm_logging/llm_logger.py +0 -466
- unrealon_llm/src/managers/__init__.py +0 -15
- unrealon_llm/src/managers/cache_manager.py +0 -67
- unrealon_llm/src/managers/cost_manager.py +0 -107
- unrealon_llm/src/managers/request_manager.py +0 -298
- unrealon_llm/src/modules/__init__.py +0 -0
- unrealon_llm/src/modules/html_processor/__init__.py +0 -25
- unrealon_llm/src/modules/html_processor/base_processor.py +0 -415
- unrealon_llm/src/modules/html_processor/details_processor.py +0 -85
- unrealon_llm/src/modules/html_processor/listing_processor.py +0 -91
- unrealon_llm/src/modules/html_processor/models/__init__.py +0 -20
- unrealon_llm/src/modules/html_processor/models/processing_models.py +0 -40
- unrealon_llm/src/modules/html_processor/models/universal_model.py +0 -56
- unrealon_llm/src/modules/html_processor/processor.py +0 -102
- unrealon_llm/src/modules/llm/__init__.py +0 -0
- unrealon_llm/src/modules/translator/__init__.py +0 -0
- unrealon_llm/src/provider.py +0 -116
- unrealon_llm/src/utils/__init__.py +0 -95
- unrealon_llm/src/utils/common.py +0 -64
- unrealon_llm/src/utils/data_extractor.py +0 -188
- unrealon_llm/src/utils/html_cleaner.py +0 -767
- unrealon_llm/src/utils/language_detector.py +0 -308
- unrealon_llm/src/utils/models_cache.py +0 -592
- unrealon_llm/src/utils/smart_counter.py +0 -229
- unrealon_llm/src/utils/token_counter.py +0 -189
- unrealon_sdk/README.md +0 -25
- unrealon_sdk/__init__.py +0 -30
- unrealon_sdk/pyproject.toml +0 -231
- unrealon_sdk/src/__init__.py +0 -150
- unrealon_sdk/src/cli/__init__.py +0 -12
- unrealon_sdk/src/cli/commands/__init__.py +0 -22
- unrealon_sdk/src/cli/commands/benchmark.py +0 -42
- unrealon_sdk/src/cli/commands/diagnostics.py +0 -573
- unrealon_sdk/src/cli/commands/health.py +0 -46
- unrealon_sdk/src/cli/commands/integration.py +0 -498
- unrealon_sdk/src/cli/commands/reports.py +0 -43
- unrealon_sdk/src/cli/commands/security.py +0 -36
- unrealon_sdk/src/cli/commands/server.py +0 -483
- unrealon_sdk/src/cli/commands/servers.py +0 -56
- unrealon_sdk/src/cli/commands/tests.py +0 -55
- unrealon_sdk/src/cli/main.py +0 -126
- unrealon_sdk/src/cli/utils/reporter.py +0 -519
- unrealon_sdk/src/clients/openapi.yaml +0 -3347
- unrealon_sdk/src/clients/python_http/__init__.py +0 -3
- unrealon_sdk/src/clients/python_http/api_config.py +0 -228
- unrealon_sdk/src/clients/python_http/models/BaseModel.py +0 -12
- unrealon_sdk/src/clients/python_http/models/BroadcastDeliveryStats.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastMessage.py +0 -17
- unrealon_sdk/src/clients/python_http/models/BroadcastMessageRequest.py +0 -35
- unrealon_sdk/src/clients/python_http/models/BroadcastPriority.py +0 -10
- unrealon_sdk/src/clients/python_http/models/BroadcastResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/BroadcastResultResponse.py +0 -33
- unrealon_sdk/src/clients/python_http/models/BroadcastTarget.py +0 -11
- unrealon_sdk/src/clients/python_http/models/ConnectionStats.py +0 -27
- unrealon_sdk/src/clients/python_http/models/ConnectionsResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/DeveloperMessageResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ErrorResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/HTTPValidationError.py +0 -16
- unrealon_sdk/src/clients/python_http/models/HealthResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/HealthStatus.py +0 -33
- unrealon_sdk/src/clients/python_http/models/LogLevel.py +0 -10
- unrealon_sdk/src/clients/python_http/models/LoggingRequest.py +0 -27
- unrealon_sdk/src/clients/python_http/models/LoggingResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/MaintenanceMode.py +0 -9
- unrealon_sdk/src/clients/python_http/models/MaintenanceModeRequest.py +0 -33
- unrealon_sdk/src/clients/python_http/models/MaintenanceStatusResponse.py +0 -39
- unrealon_sdk/src/clients/python_http/models/ParserCommandRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserMessageResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationRequest.py +0 -28
- unrealon_sdk/src/clients/python_http/models/ParserRegistrationResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ParserType.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyBlockRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyEndpointResponse.py +0 -20
- unrealon_sdk/src/clients/python_http/models/ProxyListResponse.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyProvider.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyPurchaseRequest.py +0 -25
- unrealon_sdk/src/clients/python_http/models/ProxyResponse.py +0 -47
- unrealon_sdk/src/clients/python_http/models/ProxyRotationRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ProxyStatus.py +0 -10
- unrealon_sdk/src/clients/python_http/models/ProxyUsageRequest.py +0 -19
- unrealon_sdk/src/clients/python_http/models/ProxyUsageStatsResponse.py +0 -26
- unrealon_sdk/src/clients/python_http/models/ServiceRegistrationDto.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ServiceStatsResponse.py +0 -31
- unrealon_sdk/src/clients/python_http/models/SessionStartRequest.py +0 -23
- unrealon_sdk/src/clients/python_http/models/SuccessResponse.py +0 -25
- unrealon_sdk/src/clients/python_http/models/SystemNotificationResponse.py +0 -23
- unrealon_sdk/src/clients/python_http/models/ValidationError.py +0 -18
- unrealon_sdk/src/clients/python_http/models/ValidationErrorResponse.py +0 -21
- unrealon_sdk/src/clients/python_http/models/WebSocketMetrics.py +0 -21
- unrealon_sdk/src/clients/python_http/models/__init__.py +0 -44
- unrealon_sdk/src/clients/python_http/services/None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/SocketLogging_service.py +0 -187
- unrealon_sdk/src/clients/python_http/services/SystemHealth_service.py +0 -119
- unrealon_sdk/src/clients/python_http/services/WebSocketAPI_service.py +0 -198
- unrealon_sdk/src/clients/python_http/services/__init__.py +0 -0
- unrealon_sdk/src/clients/python_http/services/admin_service.py +0 -125
- unrealon_sdk/src/clients/python_http/services/async_None_service.py +0 -35
- unrealon_sdk/src/clients/python_http/services/async_ParserManagement_service.py +0 -190
- unrealon_sdk/src/clients/python_http/services/async_ProxyManagement_service.py +0 -289
- unrealon_sdk/src/clients/python_http/services/async_SocketLogging_service.py +0 -189
- unrealon_sdk/src/clients/python_http/services/async_SystemHealth_service.py +0 -123
- unrealon_sdk/src/clients/python_http/services/async_WebSocketAPI_service.py +0 -200
- unrealon_sdk/src/clients/python_http/services/async_admin_service.py +0 -125
- unrealon_sdk/src/clients/python_websocket/__init__.py +0 -28
- unrealon_sdk/src/clients/python_websocket/client.py +0 -490
- unrealon_sdk/src/clients/python_websocket/events.py +0 -732
- unrealon_sdk/src/clients/python_websocket/example.py +0 -136
- unrealon_sdk/src/clients/python_websocket/types.py +0 -871
- unrealon_sdk/src/core/__init__.py +0 -64
- unrealon_sdk/src/core/client.py +0 -556
- unrealon_sdk/src/core/config.py +0 -465
- unrealon_sdk/src/core/exceptions.py +0 -239
- unrealon_sdk/src/core/metadata.py +0 -191
- unrealon_sdk/src/core/models.py +0 -142
- unrealon_sdk/src/core/types.py +0 -68
- unrealon_sdk/src/dto/__init__.py +0 -268
- unrealon_sdk/src/dto/authentication.py +0 -108
- unrealon_sdk/src/dto/cache.py +0 -208
- unrealon_sdk/src/dto/common.py +0 -19
- unrealon_sdk/src/dto/concurrency.py +0 -393
- unrealon_sdk/src/dto/events.py +0 -108
- unrealon_sdk/src/dto/health.py +0 -339
- unrealon_sdk/src/dto/load_balancing.py +0 -336
- unrealon_sdk/src/dto/logging.py +0 -230
- unrealon_sdk/src/dto/performance.py +0 -165
- unrealon_sdk/src/dto/rate_limiting.py +0 -295
- unrealon_sdk/src/dto/resource_pooling.py +0 -128
- unrealon_sdk/src/dto/structured_logging.py +0 -112
- unrealon_sdk/src/dto/task_scheduling.py +0 -121
- unrealon_sdk/src/dto/websocket.py +0 -55
- unrealon_sdk/src/enterprise/__init__.py +0 -59
- unrealon_sdk/src/enterprise/authentication.py +0 -401
- unrealon_sdk/src/enterprise/cache_manager.py +0 -578
- unrealon_sdk/src/enterprise/error_recovery.py +0 -494
- unrealon_sdk/src/enterprise/event_system.py +0 -549
- unrealon_sdk/src/enterprise/health_monitor.py +0 -747
- unrealon_sdk/src/enterprise/load_balancer.py +0 -964
- unrealon_sdk/src/enterprise/logging/__init__.py +0 -68
- unrealon_sdk/src/enterprise/logging/cleanup.py +0 -156
- unrealon_sdk/src/enterprise/logging/development.py +0 -744
- unrealon_sdk/src/enterprise/logging/service.py +0 -410
- unrealon_sdk/src/enterprise/multithreading_manager.py +0 -853
- unrealon_sdk/src/enterprise/performance_monitor.py +0 -539
- unrealon_sdk/src/enterprise/proxy_manager.py +0 -696
- unrealon_sdk/src/enterprise/rate_limiter.py +0 -652
- unrealon_sdk/src/enterprise/resource_pool.py +0 -763
- unrealon_sdk/src/enterprise/task_scheduler.py +0 -709
- unrealon_sdk/src/internal/__init__.py +0 -10
- unrealon_sdk/src/internal/command_router.py +0 -497
- unrealon_sdk/src/internal/connection_manager.py +0 -397
- unrealon_sdk/src/internal/http_client.py +0 -446
- unrealon_sdk/src/internal/websocket_client.py +0 -420
- unrealon_sdk/src/provider.py +0 -471
- unrealon_sdk/src/utils.py +0 -234
- /unrealon_browser/{src/cli → cli}/__init__.py +0 -0
- /unrealon_browser/{src/cli → cli}/interactive_mode.py +0 -0
- /unrealon_browser/{src/cli → cli}/main.py +0 -0
- /unrealon_browser/{src/core → core}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/__init__.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/config.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/core.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/dataclasses.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/detection.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/enums.py +0 -0
- /unrealon_browser/{src/dto → dto}/models/statistics.py +0 -0
- /unrealon_browser/{src/managers → managers}/__init__.py +0 -0
- /unrealon_browser/{src/managers → managers}/stealth.py +0 -0
|
@@ -1,308 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Language Detection Utilities
|
|
3
|
-
|
|
4
|
-
Fast and accurate language detection for text content using langdetect
|
|
5
|
-
with optimizations for short texts and technical content.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import re
|
|
9
|
-
from typing import Dict, List, Optional, Tuple
|
|
10
|
-
|
|
11
|
-
from langdetect import DetectorFactory, LangDetectException, detect, detect_langs
|
|
12
|
-
|
|
13
|
-
from unrealon_llm.src.dto import LanguageCode, LanguageDetection
|
|
14
|
-
from unrealon_llm.src.exceptions import LanguageDetectionError
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class LanguageDetector:
|
|
18
|
-
"""Advanced language detection with fallback strategies"""
|
|
19
|
-
|
|
20
|
-
def __init__(self):
|
|
21
|
-
"""Initialize language detector with deterministic results"""
|
|
22
|
-
# Set seed for consistent results
|
|
23
|
-
DetectorFactory.seed = 0
|
|
24
|
-
|
|
25
|
-
# Language patterns for fallback detection
|
|
26
|
-
self.language_patterns = {
|
|
27
|
-
LanguageCode.EN: [
|
|
28
|
-
r'\b(the|and|or|but|in|on|at|to|for|of|with|by)\b',
|
|
29
|
-
r'\b(this|that|these|those|what|where|when|why|how)\b',
|
|
30
|
-
r'\b(is|are|was|were|be|been|being|have|has|had)\b'
|
|
31
|
-
],
|
|
32
|
-
LanguageCode.KO: [
|
|
33
|
-
r'[가-힣]+', # Korean characters
|
|
34
|
-
r'\b(이|그|저|의|을|를|에|에서|으로|와|과)\b',
|
|
35
|
-
r'\b(입니다|습니다|했습니다|있습니다|없습니다)\b'
|
|
36
|
-
],
|
|
37
|
-
LanguageCode.ZH: [
|
|
38
|
-
r'[\u4e00-\u9fff]+', # Chinese characters
|
|
39
|
-
r'\b(的|了|在|是|我|你|他|她|我们|你们|他们)\b',
|
|
40
|
-
r'\b(这|那|什么|哪里|什么时候|为什么|怎么)\b'
|
|
41
|
-
],
|
|
42
|
-
LanguageCode.JA: [
|
|
43
|
-
r'[ひらがな\u3040-\u309f\u30a0-\u30ff]+', # Hiragana + Katakana
|
|
44
|
-
r'\b(の|を|に|で|から|まで|と|や|が|は)\b',
|
|
45
|
-
r'\b(です|である|します|しました|いる|ある)\b'
|
|
46
|
-
],
|
|
47
|
-
LanguageCode.RU: [
|
|
48
|
-
r'[а-яё]+', # Cyrillic characters
|
|
49
|
-
r'\b(и|или|но|в|на|за|для|от|с|по|о)\b',
|
|
50
|
-
r'\b(это|тот|эти|те|что|где|когда|почему|как)\b'
|
|
51
|
-
],
|
|
52
|
-
LanguageCode.ES: [
|
|
53
|
-
r'\b(el|la|los|las|un|una|de|en|y|o|pero)\b',
|
|
54
|
-
r'\b(que|donde|cuando|por|para|con|sin|sobre)\b',
|
|
55
|
-
r'\b(es|son|fue|fueron|ser|estar|haber|tener)\b'
|
|
56
|
-
],
|
|
57
|
-
LanguageCode.FR: [
|
|
58
|
-
r'\b(le|la|les|un|une|des|de|du|en|et|ou)\b',
|
|
59
|
-
r'\b(que|où|quand|pourquoi|comment|avec|sans)\b',
|
|
60
|
-
r'\b(est|sont|était|étaient|être|avoir|faire)\b'
|
|
61
|
-
],
|
|
62
|
-
LanguageCode.DE: [
|
|
63
|
-
r'\b(der|die|das|ein|eine|und|oder|aber|in|auf)\b',
|
|
64
|
-
r'\b(das|was|wo|wann|warum|wie|mit|ohne|für)\b',
|
|
65
|
-
r'\b(ist|sind|war|waren|sein|haben|werden)\b'
|
|
66
|
-
]
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
def detect_language(self, text: str) -> LanguageDetection:
|
|
70
|
-
"""
|
|
71
|
-
Detect language of given text with high accuracy
|
|
72
|
-
|
|
73
|
-
Args:
|
|
74
|
-
text: Input text to analyze
|
|
75
|
-
|
|
76
|
-
Returns:
|
|
77
|
-
LanguageDetection with detected language and confidence
|
|
78
|
-
|
|
79
|
-
Raises:
|
|
80
|
-
LanguageDetectionError: If detection fails
|
|
81
|
-
"""
|
|
82
|
-
if not text or not text.strip():
|
|
83
|
-
raise LanguageDetectionError("Empty text provided for language detection")
|
|
84
|
-
|
|
85
|
-
# Clean text for better detection
|
|
86
|
-
cleaned_text = self._clean_text(text)
|
|
87
|
-
|
|
88
|
-
if len(cleaned_text) < 3:
|
|
89
|
-
raise LanguageDetectionError("Text too short for reliable language detection")
|
|
90
|
-
|
|
91
|
-
try:
|
|
92
|
-
# Try primary detection with langdetect
|
|
93
|
-
result = self._detect_with_langdetect(cleaned_text)
|
|
94
|
-
if result.confidence >= 0.8:
|
|
95
|
-
return result
|
|
96
|
-
|
|
97
|
-
# Fallback to pattern-based detection
|
|
98
|
-
pattern_result = self._detect_with_patterns(cleaned_text)
|
|
99
|
-
if pattern_result.confidence >= 0.7:
|
|
100
|
-
return pattern_result
|
|
101
|
-
|
|
102
|
-
# If both methods have low confidence, use langdetect result
|
|
103
|
-
if result.confidence > 0.5:
|
|
104
|
-
return result
|
|
105
|
-
|
|
106
|
-
# Last resort: assume English
|
|
107
|
-
return LanguageDetection(
|
|
108
|
-
detected_language=LanguageCode.EN,
|
|
109
|
-
confidence=0.3,
|
|
110
|
-
alternative_languages=[
|
|
111
|
-
{"language": LanguageCode.EN, "confidence": 0.3}
|
|
112
|
-
]
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
except Exception as e:
|
|
116
|
-
raise LanguageDetectionError(f"Language detection failed: {str(e)}")
|
|
117
|
-
|
|
118
|
-
def detect_multiple_languages(self, text: str, top_n: int = 3) -> List[Dict[str, float]]:
|
|
119
|
-
"""
|
|
120
|
-
Detect multiple possible languages with probabilities
|
|
121
|
-
|
|
122
|
-
Args:
|
|
123
|
-
text: Input text to analyze
|
|
124
|
-
top_n: Number of top languages to return
|
|
125
|
-
|
|
126
|
-
Returns:
|
|
127
|
-
List of language-confidence pairs
|
|
128
|
-
"""
|
|
129
|
-
# Use langdetect for multiple language detection
|
|
130
|
-
|
|
131
|
-
try:
|
|
132
|
-
cleaned_text = self._clean_text(text)
|
|
133
|
-
languages = detect_langs(cleaned_text)
|
|
134
|
-
|
|
135
|
-
results = []
|
|
136
|
-
for lang_info in languages[:top_n]:
|
|
137
|
-
# Map to our language codes
|
|
138
|
-
our_lang_code = self._map_to_our_language_code(lang_info.lang)
|
|
139
|
-
if our_lang_code:
|
|
140
|
-
results.append({
|
|
141
|
-
"language": our_lang_code,
|
|
142
|
-
"confidence": float(lang_info.prob)
|
|
143
|
-
})
|
|
144
|
-
|
|
145
|
-
return results
|
|
146
|
-
|
|
147
|
-
except LangDetectException:
|
|
148
|
-
# Fallback to single detection
|
|
149
|
-
single_result = self.detect_language(text)
|
|
150
|
-
return [{"language": single_result.detected_language, "confidence": single_result.confidence}]
|
|
151
|
-
|
|
152
|
-
def is_language(self, text: str, expected_language: LanguageCode, threshold: float = 0.8) -> bool:
|
|
153
|
-
"""
|
|
154
|
-
Check if text is in expected language with given confidence threshold
|
|
155
|
-
|
|
156
|
-
Args:
|
|
157
|
-
text: Text to check
|
|
158
|
-
expected_language: Expected language code
|
|
159
|
-
threshold: Minimum confidence threshold
|
|
160
|
-
|
|
161
|
-
Returns:
|
|
162
|
-
True if text is likely in expected language
|
|
163
|
-
"""
|
|
164
|
-
try:
|
|
165
|
-
detection = self.detect_language(text)
|
|
166
|
-
return (detection.detected_language == expected_language and
|
|
167
|
-
detection.confidence >= threshold)
|
|
168
|
-
except LanguageDetectionError:
|
|
169
|
-
return False
|
|
170
|
-
|
|
171
|
-
def _detect_with_langdetect(self, text: str) -> LanguageDetection:
|
|
172
|
-
"""Detect language using langdetect library"""
|
|
173
|
-
try:
|
|
174
|
-
# Single detection for primary language
|
|
175
|
-
detected_lang = detect(text)
|
|
176
|
-
|
|
177
|
-
# Get probabilities for all languages
|
|
178
|
-
lang_probs = detect_langs(text)
|
|
179
|
-
|
|
180
|
-
# Find our language code and confidence
|
|
181
|
-
our_lang_code = self._map_to_our_language_code(detected_lang)
|
|
182
|
-
confidence = 0.0
|
|
183
|
-
alternatives = []
|
|
184
|
-
|
|
185
|
-
for lang_info in lang_probs:
|
|
186
|
-
mapped_code = self._map_to_our_language_code(lang_info.lang)
|
|
187
|
-
if mapped_code:
|
|
188
|
-
if mapped_code == our_lang_code:
|
|
189
|
-
confidence = float(lang_info.prob)
|
|
190
|
-
else:
|
|
191
|
-
alternatives.append({
|
|
192
|
-
"language": mapped_code,
|
|
193
|
-
"confidence": float(lang_info.prob)
|
|
194
|
-
})
|
|
195
|
-
|
|
196
|
-
if not our_lang_code:
|
|
197
|
-
our_lang_code = LanguageCode.EN # Default fallback
|
|
198
|
-
confidence = 0.5
|
|
199
|
-
|
|
200
|
-
return LanguageDetection(
|
|
201
|
-
detected_language=our_lang_code,
|
|
202
|
-
confidence=confidence,
|
|
203
|
-
alternative_languages=alternatives
|
|
204
|
-
)
|
|
205
|
-
|
|
206
|
-
except LangDetectException as e:
|
|
207
|
-
raise LanguageDetectionError(f"langdetect failed: {str(e)}")
|
|
208
|
-
|
|
209
|
-
def _detect_with_patterns(self, text: str) -> LanguageDetection:
|
|
210
|
-
"""Fallback pattern-based language detection"""
|
|
211
|
-
text_lower = text.lower()
|
|
212
|
-
language_scores = {}
|
|
213
|
-
|
|
214
|
-
for lang_code, patterns in self.language_patterns.items():
|
|
215
|
-
score = 0
|
|
216
|
-
for pattern in patterns:
|
|
217
|
-
matches = len(re.findall(pattern, text_lower, re.IGNORECASE))
|
|
218
|
-
score += matches
|
|
219
|
-
|
|
220
|
-
# Normalize score by text length
|
|
221
|
-
if len(text) > 0:
|
|
222
|
-
language_scores[lang_code] = score / len(text.split())
|
|
223
|
-
|
|
224
|
-
if not language_scores:
|
|
225
|
-
return LanguageDetection(
|
|
226
|
-
detected_language=LanguageCode.EN,
|
|
227
|
-
confidence=0.3,
|
|
228
|
-
alternative_languages=[]
|
|
229
|
-
)
|
|
230
|
-
|
|
231
|
-
# Find best match
|
|
232
|
-
best_lang = max(language_scores.items(), key=lambda x: x[1])
|
|
233
|
-
confidence = min(best_lang[1] * 2, 1.0) # Scale confidence
|
|
234
|
-
|
|
235
|
-
# Create alternatives
|
|
236
|
-
alternatives = []
|
|
237
|
-
for lang, score in sorted(language_scores.items(), key=lambda x: x[1], reverse=True)[1:3]:
|
|
238
|
-
if score > 0:
|
|
239
|
-
alternatives.append({
|
|
240
|
-
"language": lang,
|
|
241
|
-
"confidence": min(score * 2, 1.0)
|
|
242
|
-
})
|
|
243
|
-
|
|
244
|
-
return LanguageDetection(
|
|
245
|
-
detected_language=best_lang[0],
|
|
246
|
-
confidence=confidence,
|
|
247
|
-
alternative_languages=alternatives
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
def _clean_text(self, text: str) -> str:
|
|
251
|
-
"""Clean text for better language detection"""
|
|
252
|
-
# Remove URLs
|
|
253
|
-
text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
|
|
254
|
-
|
|
255
|
-
# Remove email addresses
|
|
256
|
-
text = re.sub(r'\S+@\S+', '', text)
|
|
257
|
-
|
|
258
|
-
# Remove numbers (but keep words with numbers)
|
|
259
|
-
text = re.sub(r'\b\d+\b', '', text)
|
|
260
|
-
|
|
261
|
-
# Remove excessive whitespace
|
|
262
|
-
text = re.sub(r'\s+', ' ', text)
|
|
263
|
-
|
|
264
|
-
# Remove HTML tags
|
|
265
|
-
text = re.sub(r'<[^>]+>', '', text)
|
|
266
|
-
|
|
267
|
-
return text.strip()
|
|
268
|
-
|
|
269
|
-
def _map_to_our_language_code(self, langdetect_code: str) -> Optional[LanguageCode]:
|
|
270
|
-
"""Map langdetect language codes to our enum"""
|
|
271
|
-
mapping = {
|
|
272
|
-
'en': LanguageCode.EN,
|
|
273
|
-
'ko': LanguageCode.KO,
|
|
274
|
-
'zh-cn': LanguageCode.ZH,
|
|
275
|
-
'zh': LanguageCode.ZH,
|
|
276
|
-
'ja': LanguageCode.JA,
|
|
277
|
-
'ru': LanguageCode.RU,
|
|
278
|
-
'es': LanguageCode.ES,
|
|
279
|
-
'fr': LanguageCode.FR,
|
|
280
|
-
'de': LanguageCode.DE,
|
|
281
|
-
'it': LanguageCode.IT,
|
|
282
|
-
'pt': LanguageCode.PT,
|
|
283
|
-
'ar': LanguageCode.AR,
|
|
284
|
-
'hi': LanguageCode.HI,
|
|
285
|
-
'tr': LanguageCode.TR,
|
|
286
|
-
'pl': LanguageCode.PL,
|
|
287
|
-
'uk': LanguageCode.UK,
|
|
288
|
-
}
|
|
289
|
-
return mapping.get(langdetect_code.lower())
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
# Convenience functions
|
|
293
|
-
def detect_language(text: str) -> LanguageDetection:
|
|
294
|
-
"""Quick language detection"""
|
|
295
|
-
detector = LanguageDetector()
|
|
296
|
-
return detector.detect_language(text)
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
def is_language(text: str, expected_language: LanguageCode, threshold: float = 0.8) -> bool:
|
|
300
|
-
"""Quick language verification"""
|
|
301
|
-
detector = LanguageDetector()
|
|
302
|
-
return detector.is_language(text, expected_language, threshold)
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
def detect_multiple_languages(text: str, top_n: int = 3) -> List[Dict[str, float]]:
|
|
306
|
-
"""Quick multiple language detection"""
|
|
307
|
-
detector = LanguageDetector()
|
|
308
|
-
return detector.detect_multiple_languages(text, top_n)
|