jtcg_locale_detector 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +37 -0
- data/PACKAGING_SUMMARY.md +195 -0
- data/README.md +226 -0
- data/bin/locale-detector +159 -0
- data/jtcg_locale_detector.gemspec +48 -0
- data/lib/locale_detector/client.rb +163 -0
- data/lib/locale_detector/detector.rb +46 -0
- data/lib/locale_detector/version.rb +3 -0
- data/lib/locale_detector.rb +25 -0
- data/locale_detector.gemspec +46 -0
- data/python/cli.py +220 -0
- data/python/requirements.txt +8 -0
- data/python/src/__init__.py +10 -0
- data/python/src/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/__pycache__/__init__.cpython-313.pyc +0 -0
- data/python/src/__pycache__/locale_data.cpython-311.pyc +0 -0
- data/python/src/__pycache__/locale_data.cpython-313.pyc +0 -0
- data/python/src/__pycache__/locale_detector.cpython-311.pyc +0 -0
- data/python/src/__pycache__/locale_detector.cpython-313.pyc +0 -0
- data/python/src/artifacts/fasttext/lid.176.bin +0 -0
- data/python/src/artifacts/fasttext/lid.176.ftz +0 -0
- data/python/src/download_fasttext.py +69 -0
- data/python/src/locale_data.py +178 -0
- data/python/src/locale_detector.py +534 -0
- data/python/src/locale_detector_c.c +403 -0
- data/python/src/locale_detector_c.h +37 -0
- data/python/src/locale_detector_cy.cpp +23126 -0
- data/python/src/locale_detector_cy.cpython-311-darwin.so +0 -0
- data/python/src/locale_detector_cy.cpython-313-darwin.so +0 -0
- data/python/src/locale_detector_cy.html +6460 -0
- data/python/src/locale_detector_cy.pyx +501 -0
- data/python/src/utils/__init__.py +1 -0
- data/python/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- data/python/src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
- data/python/src/utils/__pycache__/data_utils.cpython-313.pyc +0 -0
- data/python/src/utils/data_utils.py +50 -0
- data/python/src/utils/data_utils_cy.cpp +10086 -0
- data/python/src/utils/data_utils_cy.cpython-311-darwin.so +0 -0
- data/python/src/utils/data_utils_cy.cpython-313-darwin.so +0 -0
- data/python/src/utils/data_utils_cy.html +600 -0
- data/python/src/utils/data_utils_cy.pyx +94 -0
- data/python/src/zhon/__init__.py +7 -0
- data/python/src/zhon/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/hanzi.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/pinyin.cpython-311.pyc +0 -0
- data/python/src/zhon/__pycache__/zhuyin.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__init__.py +14 -0
- data/python/src/zhon/cedict/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/all.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/simplified.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/__pycache__/traditional.cpython-311.pyc +0 -0
- data/python/src/zhon/cedict/all.py +4 -0
- data/python/src/zhon/cedict/simplified.py +4 -0
- data/python/src/zhon/cedict/traditional.py +4 -0
- data/python/src/zhon/hanzi.py +81 -0
- data/python/src/zhon/pinyin.py +187 -0
- data/python/src/zhon/zhuyin.py +46 -0
- metadata +198 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
require "json"
|
|
2
|
+
require "open3"
|
|
3
|
+
require "tempfile"
|
|
4
|
+
|
|
5
|
+
module LocaleDetector
  # Ruby front end for the Python detection CLI bundled with the gem.
  #
  # Every detection call runs `<python_path> <cli_path> ...` through Open3 in
  # argument-vector form (no shell) and parses what the script prints on stdout.
  class Client
    # Oldest accepted Python release, as [major, minor].
    MINIMUM_PYTHON_VERSION = [3, 11].freeze

    # Interpreter names probed on PATH, in order of preference.
    PYTHON_COMMANDS = %w[python3 python].freeze

    # @return [Hash] effective options (defaults merged with caller overrides)
    attr_reader :options

    # @param options [Hash] overrides for :mode, :low_memory, :output_format,
    #   :encoding, :python_path and :cli_path; nil values fall back to the defaults
    # @raise [LocaleDetector::Error] if Python or the CLI script cannot be located,
    #   or the interpreter is older than 3.11
    def initialize(**options)
      supplied = options.reject { |_key, value| value.nil? }
      @options = default_options(supplied).merge(supplied)
      validate_python_environment
    end

    # Detect locale for a single text.
    #
    # @param text [String, nil]
    # @return [Hash, nil] parsed CLI result; nil when text is nil or blank
    # @raise [LocaleDetector::Error] if the CLI fails or prints unparsable output
    def detect(text)
      return nil if text.nil? || text.strip.empty?

      parse_result(execute_detection(text: text))
    end

    # Detect locales for multiple texts. One CLI process is spawned per text.
    #
    # @param texts [Array<String>, nil]
    # @return [Array<Hash, nil>] one entry per input, in input order
    def detect_batch(texts)
      return [] if texts.nil? || texts.empty?

      texts.map { |text| detect(text) }
    end

    # Detect locale from file.
    #
    # @param file_path [String] path handed to the CLI through --file
    # @return [Hash] parsed CLI result
    # @raise [LocaleDetector::Error] if the path is not an existing regular file
    def detect_file(file_path)
      # File.file? also rejects directories, which the CLI could not read anyway.
      raise Error, "File not found: #{file_path}" unless File.file?(file_path)

      parse_result(execute_detection(file: file_path))
    end

    # Get detailed detection information.
    #
    # The CLI is always asked for JSON here, whatever :output_format says,
    # because the reply is parsed as JSON.
    #
    # @param text [String, nil]
    # @return [Hash, nil] parsed JSON document; nil when text is nil or blank
    def detect_with_details(text)
      return nil if text.nil? || text.strip.empty?

      parse_detailed_result(execute_detection(text: text, details: true))
    end

    private

    # Builds the default option set. The interpreter and CLI script are only
    # searched for when the caller did not supply them, so an explicit
    # :python_path / :cli_path works even when auto-discovery would fail.
    def default_options(overrides = {})
      defaults = {
        mode: "ratio",         # conversion, ratio, both
        low_memory: false,     # use low memory mode
        output_format: "json", # simple, json, detailed
        encoding: "utf-8"      # file encoding
      }
      defaults[:python_path] = find_python_executable unless overrides.key?(:python_path)
      defaults[:cli_path] = find_cli_path unless overrides.key?(:cli_path)
      defaults
    end

    # Returns the first interpreter name from PYTHON_COMMANDS found on PATH.
    def find_python_executable
      PYTHON_COMMANDS.each do |cmd|
        return cmd if executable_on_path?(cmd)
      end

      raise Error, "Python executable not found. Please install Python 3.11+."
    end

    # True when +cmd+ names an executable regular file in one of the PATH
    # directories. PATHEXT suffixes are tried as well (set on Windows).
    def executable_on_path?(cmd)
      suffixes = [""] + ENV.fetch("PATHEXT", "").split(";")
      directories = ENV.fetch("PATH", "").split(File::PATH_SEPARATOR).reject(&:empty?)

      directories.any? do |dir|
        suffixes.any? do |suffix|
          candidate = File.join(dir, "#{cmd}#{suffix}")
          File.file?(candidate) && File.executable?(candidate)
        end
      end
    end

    # Looks for cli.py under <gem root>/python, then next to the gem root.
    def find_cli_path
      gem_root = File.expand_path("../..", __dir__)
      cli_candidates = [
        File.join(gem_root, "python", "cli.py"),
        File.join(File.dirname(gem_root), "cli.py") # Parent directory
      ]

      found = cli_candidates.find { |path| File.file?(path) }
      return File.expand_path(found) if found

      raise Error, "CLI script not found. Please ensure the Python components are installed."
    end

    # Runs `<python_path> --version` and rejects interpreters older than
    # MINIMUM_PYTHON_VERSION. Output that carries no recognisable version is
    # accepted as-is.
    def validate_python_environment
      output, status = python_version_output
      raise Error, "Python is not available: #{output}" unless status.success?

      match = output.match(/Python (\d+)\.(\d+)/)
      return unless match

      # Compare [major, minor] as integers: as floats 3.9 > 3.11 and 3.10 == 3.1.
      found = [match[1].to_i, match[2].to_i]
      return unless (found <=> MINIMUM_PYTHON_VERSION).negative?

      raise Error, "Python version must be 3.11 or higher. Found: #{found.join('.')}"
    end

    # Spawns the interpreter the same way execute_detection does (argv form,
    # no shell), so a path containing spaces works in both places.
    def python_version_output
      Open3.capture2e(@options[:python_path].to_s, "--version")
    rescue SystemCallError => e
      raise Error, "Python is not available: #{e.message}"
    end

    # Runs the CLI and returns its stripped stdout.
    def execute_detection(text: nil, file: nil, details: false)
      cmd_args = build_command_args(text: text, file: file, details: details)

      output, error, status = Open3.capture3(*cmd_args)

      raise Error, "Detection failed: #{error}" unless status.success?

      output.strip
    rescue SystemCallError => e
      raise Error, "Detection failed: #{e.message}"
    end

    # Assembles the argument vector for one CLI invocation.
    def build_command_args(text: nil, file: nil, details: false)
      # detect_with_details parses the reply as JSON, so JSON is forced there.
      output_format = details ? "json" : @options[:output_format]

      args = [@options[:python_path], @options[:cli_path]]
      args += ["--file", file.to_s] if file && !text
      args += ["--mode", @options[:mode]]
      args += ["--output-format", output_format]
      args += ["--encoding", @options[:encoding]]
      args << "--low-memory" if @options[:low_memory]
      args << "--details" if details
      # The text goes last, after "--", so argparse never reads a text that
      # starts with "-" (e.g. "-hello", "--help") as an option.
      args += ["--", text] if text

      args
    end

    # Parses CLI stdout according to the configured :output_format.
    def parse_result(output)
      case @options[:output_format]
      when "json"
        begin
          JSON.parse(output)
        rescue JSON::ParserError => e
          raise Error, "Failed to parse JSON response: #{e.message}"
        end
      when "detailed"
        parse_detailed_output(output)
      else # "simple" or any other format
        { "locale" => output.strip }
      end
    end

    # Parses the JSON document printed for a --details request.
    def parse_detailed_result(output)
      JSON.parse(output)
    rescue JSON::ParserError => e
      raise Error, "Failed to parse detailed JSON response: #{e.message}"
    end

    # Parses the "detailed" text format into a Hash.
    #
    # "key: value" pairs may sit on separate lines or be joined by "|" on one
    # line (the bundled cli.py prints "a: x | b: y | c: z"). Keys are
    # stripped, downcased and have spaces replaced by underscores. When the
    # output holds no pair at all it is returned as { "locale" => output }.
    def parse_detailed_output(output)
      fields = output.split(/[\n|]/).each_with_object({}) do |segment, parsed|
        next unless segment.include?(":")

        key, value = segment.split(":", 2)
        parsed[key.strip.downcase.tr(" ", "_")] = value.strip
      end

      bare = output.strip
      return fields if bare.empty? || !fields.empty?

      { "locale" => bare }
    end
  end
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
module LocaleDetector
  # Static lookup tables describing what the detector can report, plus
  # helpers for querying them.
  class Detector
    # Base language codes accepted by language_supported?.
    SUPPORTED_LANGUAGES = %w[
      af am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
      ca cbk ce ceb ckb co cs cv cy da de diq dsb dv dz ee el eml en eo es et
      eu ext fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy
      ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez
      li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn
      nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro
      ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw szl ta te tg th
      tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo
      yue zh zu
    ].freeze

    # Chinese locale tags mapped to a human-readable description.
    CHINESE_VARIANTS = {
      "zh-TW" => "Traditional Chinese (Taiwan)",
      "zh-CN" => "Simplified Chinese (China)",
      "zh-HK" => "Traditional Chinese (Hong Kong)",
      "zh-MO" => "Traditional Chinese (Macau)",
      "zh-SG" => "Simplified Chinese (Singapore)"
    }.freeze

    # Values accepted for the detection mode option.
    DETECTION_MODES = %w[conversion ratio both].freeze

    # @return [Array<String>] the frozen SUPPORTED_LANGUAGES list
    def self.supported_languages
      SUPPORTED_LANGUAGES
    end

    # @return [Hash{String => String}] the frozen CHINESE_VARIANTS table
    def self.chinese_variants
      CHINESE_VARIANTS
    end

    # @return [Array<String>] the frozen DETECTION_MODES list
    def self.detection_modes
      DETECTION_MODES
    end

    # Whether the base language of +language_code+ is in SUPPORTED_LANGUAGES.
    #
    # Only the part before the first "-" or "_" is looked at, and letter case
    # is ignored, so "zh-TW", "zh_tw" and "ZH" all resolve to "zh".
    #
    # @param language_code [String, Symbol, nil]
    # @return [Boolean] false for nil or blank input
    def self.language_supported?(language_code)
      base_language = language_code.to_s.strip.split(/[-_]/).first
      return false if base_language.nil? || base_language.empty?

      SUPPORTED_LANGUAGES.include?(base_language.downcase)
    end

    # Whether +locale+ is one of the tags in CHINESE_VARIANTS.
    #
    # The tag is normalised to "ll-RR" first, so "zh_tw" matches "zh-TW".
    #
    # @param locale [String, Symbol, nil]
    # @return [Boolean] false for nil or blank input
    def self.chinese_variant?(locale)
      CHINESE_VARIANTS.key?(normalize_locale(locale))
    end

    # Lower-cases the language part, upper-cases the rest and joins them
    # with "-"; returns "" for nil or blank input.
    def self.normalize_locale(locale)
      language, region = locale.to_s.strip.split(/[-_]/, 2)
      return "" if language.nil?

      [language.downcase, region && region.upcase].compact.join("-")
    end
    private_class_method :normalize_locale
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require_relative "locale_detector/version"
|
|
2
|
+
require_relative "locale_detector/client"
|
|
3
|
+
require_relative "locale_detector/detector"
|
|
4
|
+
|
|
5
|
+
module LocaleDetector
  # Error class raised by this gem.
  class Error < StandardError; end

  class << self
    # One-shot detection: builds a Client from +options+ and detects +text+.
    #
    # @param text [String]
    # @param options [Hash] forwarded to Client.new
    # @return whatever Client#detect returns
    def detect(text, **options)
      Client.new(**options).detect(text)
    end

    # One-shot batch detection over +texts+ with a freshly built Client.
    #
    # @param texts [Array<String>]
    # @param options [Hash] forwarded to Client.new
    # @return whatever Client#detect_batch returns
    def detect_batch(texts, **options)
      Client.new(**options).detect_batch(texts)
    end

    # One-shot detection of the file at +file_path+ with a freshly built Client.
    #
    # @param file_path [String]
    # @param options [Hash] forwarded to Client.new
    # @return whatever Client#detect_file returns
    def detect_file(file_path, **options)
      Client.new(**options).detect_file(file_path)
    end
  end
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Gem specification for jtcg_locale_detector.
Gem::Specification.new do |spec|
  spec.name = "jtcg_locale_detector"
  spec.version = "1.0.1"
  spec.authors = ["JTCG Team"]
  spec.email = ["enor@j-tcg.com"]

  # summary/description were each assigned twice; only the final values, which
  # are the ones that took effect, are kept.
  spec.summary = "多語言地區檢測工具,特別優化中文繁簡體檢測"
  spec.description = "Multi-language locale detector with specialized Chinese variant detection. " \
                     "Uses FastText for initial language identification and multiple algorithms " \
                     "for Chinese variant detection."
  spec.homepage = "https://github.com/jtcg/locale-detector"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  # Author roles
  spec.metadata["packaged_by"] = "JTCG Team <enor@j-tcg.com>"
  spec.metadata["original_author"] = "Original Author <mark@j-tcg.com>"
  spec.metadata["maintainer"] = "JTCG Team"

  # Specify which files should be added to the gem when it is released.
  # Only regular files are listed, and Python bytecode caches are left out:
  # they are regenerated by the interpreter and tied to one Python version.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    Dir["{bin,lib,python}/**/*", "*.md", "*.txt", "*.gemspec"].select do |path|
      File.file?(path) && !path.include?("__pycache__") && !path.end_with?(".pyc")
    end
  end

  spec.bindir = "bin"
  spec.executables = ["locale-detector"]
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_dependency "ffi", "~> 1.15"
  spec.add_dependency "json", "~> 2.6"

  # Development dependencies
  # "~> 1.17" excluded every Bundler 2.x release; allow 1.17 and later.
  spec.add_development_dependency "bundler", ">= 1.17"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.12"
  spec.add_development_dependency "rubocop", "~> 1.50"

  # Metadata
  spec.metadata["rubygems_mfa_required"] = "true"
end
|
data/python/cli.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
LocaleDetector CLI - multi-language locale detection command-line tool

Detects many languages, with detection of Traditional/Simplified Chinese variants specifically optimized.
Uses FastText for initial language identification, then applies specialized detection to Chinese text.
"""

import argparse
import asyncio
import json
import os
import sys

# When running inside a PyInstaller bundle, automatically add its "src" directory to sys.path
if hasattr(sys, "_MEIPASS"):
    cython_mod_path = os.path.join(sys._MEIPASS, "src")
    if cython_mod_path not in sys.path:
        sys.path.insert(0, cython_mod_path)
from pathlib import Path
from typing import Literal

# Prefer the locale_detector_cy module; fall back to locale_detector when it cannot be imported.
try:
    from src.locale_detector_cy import LocaleDetector
except ImportError:
    from src.locale_detector import LocaleDetector
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser.

    Exactly one input source is required: a positional text, ``-f/--file``,
    or ``--batch``. The remaining options select the Chinese detection mode,
    the FastText model size, the output format and destination, and the file
    encoding.
    """
    cli = argparse.ArgumentParser(
        description="多語言地區檢測工具 - 特別優化中文繁簡體檢測",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用範例:
%(prog)s "Hello world" # 檢測單一文本
%(prog)s -f input.txt # 從檔案讀取
%(prog)s "你好世界" --details # 顯示詳細信息
%(prog)s "中文文本" --mode ratio # 使用比例分析模式
%(prog)s --batch file1.txt file2.txt # 批次處理多個檔案
""",
    )

    # Input source: text, a single file, or a batch of files (mutually exclusive)
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("text", nargs="?", help="要檢測的文本內容")
    source.add_argument("-f", "--file", type=Path, help="從檔案讀取文本進行檢測")
    source.add_argument("--batch", nargs="+", type=Path, help="批次處理多個檔案")

    # Detection mode
    cli.add_argument(
        "--mode",
        choices=["conversion", "ratio", "both"],
        default="ratio",
        help="中文檢測模式: conversion(快速), ratio(準確), both(混合) [預設: ratio]",
    )

    # Memory usage
    cli.add_argument(
        "--low-memory",
        action="store_true",
        help="使用低記憶體模式 (使用較小的 FastText 模型)",
    )

    # Output format
    cli.add_argument(
        "--output-format",
        choices=["simple", "json", "detailed"],
        default="simple",
        help="輸出格式: simple(簡單), json(JSON格式), detailed(詳細信息) [預設: simple]",
    )

    # Detailed information
    cli.add_argument(
        "--details",
        action="store_true",
        help="顯示檢測詳細信息(包含信心分數)",
    )

    # Output file
    cli.add_argument("-o", "--output", type=Path, help="將結果輸出到檔案")

    # Encoding
    cli.add_argument("--encoding", default="utf-8", help="檔案編碼格式 [預設: utf-8]")

    return cli
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def read_text_file(file_path: Path, encoding: str = "utf-8") -> str:
    """Read a text file and return its content with surrounding whitespace stripped.

    The requested ``encoding`` is tried first; if decoding fails, "gbk",
    "big5" and "latin1" are tried in that order.

    Raises:
        ValueError: if none of the encodings can decode the file. Since latin1
            maps every byte value, this is effectively never reached.
    """
    # Requested encoding first, then the fixed fallbacks
    for candidate in (encoding, "gbk", "big5", "latin1"):
        try:
            content = file_path.read_text(encoding=candidate)
        except UnicodeDecodeError:
            continue
        return content.strip()
    raise ValueError(f"無法解碼檔案 {file_path},請指定正確的編碼格式")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def format_output(result: dict | str, output_format: str, show_details: bool) -> str:
|
|
96
|
+
"""格式化輸出結果"""
|
|
97
|
+
if isinstance(result, str):
|
|
98
|
+
# 簡單字串結果
|
|
99
|
+
if output_format == "json":
|
|
100
|
+
return json.dumps({"locale": result}, ensure_ascii=False, indent=2)
|
|
101
|
+
return result
|
|
102
|
+
|
|
103
|
+
# 詳細結果字典
|
|
104
|
+
if output_format == "json":
|
|
105
|
+
return json.dumps(result, ensure_ascii=False, indent=2)
|
|
106
|
+
if output_format == "detailed" or show_details:
|
|
107
|
+
locale = result.get("locale", "unknown")
|
|
108
|
+
language = result.get("language", "unknown")
|
|
109
|
+
score = result.get("score", 0.0)
|
|
110
|
+
return f"地區: {locale} | 語言: {language} | 信心分數: {score:.3f}"
|
|
111
|
+
return result.get("locale", "unknown")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def write_output(content: str, output_file: Path | None, encoding: str = "utf-8") -> None:
|
|
115
|
+
"""寫入輸出"""
|
|
116
|
+
if output_file:
|
|
117
|
+
output_file.write_text(content, encoding=encoding)
|
|
118
|
+
print(f"結果已儲存至: {output_file}")
|
|
119
|
+
else:
|
|
120
|
+
print(content)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
async def process_single_text(
    detector: LocaleDetector, text: str, mode: Literal["conversion", "ratio", "both"], show_details: bool
) -> dict | str:
    """Run detection on a single text.

    With ``show_details`` the detector's ``adetect_with_details`` is awaited
    with the given ``mode``; otherwise ``adetect`` is awaited with the text only.
    """
    # NOTE(review): `mode` is not forwarded to adetect(); confirm whether that is intended.
    if not show_details:
        return await detector.adetect(text)
    return await detector.adetect_with_details(text, mode=mode)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
async def process_batch(
    detector: LocaleDetector, file_paths: list[Path], mode: Literal["conversion", "ratio", "both"], encoding: str, show_details: bool
) -> list[dict]:
    """Run detection on several files, one after another.

    Returns one dict per input path, in input order. Each dict carries a
    "file" key; a file that fails to read or detect yields
    ``{"file", "error", "locale": "error"}`` instead of aborting the batch.
    """
    collected = []

    for path in file_paths:
        label = str(path)
        try:
            content = read_text_file(path, encoding)
            if show_details:
                entry = await detector.adetect_with_details(content, mode=mode)
                entry["file"] = label
            else:
                entry = {"file": label, "locale": await detector.adetect(content)}
        except Exception as exc:
            # Record the failure and keep going with the remaining files
            entry = {"file": label, "error": str(exc), "locale": "error"}
        collected.append(entry)

    return collected
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
async def main() -> None:
    """Main routine: parse arguments, run detection, emit the result.

    Progress messages go to stderr. Any exception, and a keyboard interrupt,
    is reported on stderr and ends the process with exit status 1.
    """
    args = create_parser().parse_args()

    def batch_line(entry: dict) -> str:
        # One "<file>: ..." line of the non-JSON batch report
        if "error" in entry:
            return f"{entry['file']}: ERROR - {entry['error']}"
        formatted = format_output(entry, args.output_format, args.details)
        if args.details or args.output_format == "detailed":
            return f"{entry['file']}: {formatted}"
        return f"{entry['file']}: {entry['locale']}"

    try:
        # Initialize the detector
        print("正在初始化 LocaleDetector...", file=sys.stderr)
        detector = LocaleDetector(low_memory=args.low_memory)

        if args.batch:
            # Batch processing
            print(f"正在處理 {len(args.batch)} 個檔案...", file=sys.stderr)
            results = await process_batch(detector, args.batch, args.mode, args.encoding, args.details)

            if args.output_format == "json":
                output = json.dumps(results, ensure_ascii=False, indent=2)
            else:
                output = "\n".join([batch_line(entry) for entry in results])
        else:
            # Single text processing
            if args.file:
                text = read_text_file(args.file, args.encoding)
            else:
                text = args.text

            if not text.strip():
                print("錯誤: 輸入文本為空", file=sys.stderr)
                sys.exit(1)

            print("正在檢測語言...", file=sys.stderr)
            result = await process_single_text(detector, text, args.mode, args.details)
            output = format_output(result, args.output_format, args.details)

        # Emit the result
        write_output(output, args.output, args.encoding)

    except KeyboardInterrupt:
        print("\n程式被使用者中斷", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"錯誤: {e}", file=sys.stderr)
        sys.exit(1)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def cli_entry_point() -> None:
    """CLI entry point, used by the packaged executable.

    Runs ``main()`` with ``asyncio.run``; if that raises ``RuntimeError``,
    ``main()`` is run again on the loop returned by ``asyncio.get_event_loop()``.
    """
    try:
        asyncio.run(main())
    except RuntimeError:
        # NOTE(review): presumably a fallback for when asyncio.run() cannot start
        # a new loop — confirm; as written, any RuntimeError restarts main().
        loop = asyncio.get_event_loop()
        loop.run_until_complete(main())


if __name__ == "__main__":
    cli_entry_point()
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def download_file(url: str, target_path: Path, desc: str, timeout: float = 30.0):
    """
    Download a file with progress bar

    The data is streamed into ``<target_path>.part`` and moved onto
    ``target_path`` only after the whole body has been received, so a failed
    or interrupted download never leaves a file at ``target_path``.

    Args:
        url: address to fetch.
        target_path: final location of the file; parent directories are created.
        desc: label shown on the progress bar.
        timeout: seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
        OSError: if the file cannot be written or moved into place.
    """
    # Ensure the directory exists
    target_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = target_path.with_name(target_path.name + ".part")

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail on HTTP errors instead of saving the error page as the file
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            # Show download progress
            with (
                open(partial_path, "wb") as file,
                tqdm(
                    desc=desc,
                    total=total_size,
                    unit="iB",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar,
            ):
                for data in response.iter_content(chunk_size=1024):
                    size = file.write(data)
                    pbar.update(size)

        # Publish the finished download under its final name
        partial_path.replace(target_path)
    except BaseException:
        # Remove the incomplete file, then let the original error propagate
        partial_path.unlink(missing_ok=True)
        raise
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def main():
    """Download the FastText language-identification models.

    Files already present in the target directory are skipped. Any error is
    reported on stderr and ends the process with exit status 1.
    """
    # Target directory, resolved relative to this file's location
    # NOTE(review): this points at <parents[2]>/tools/locales/artifacts/fasttext — confirm
    # it is the directory the detector actually loads its models from.
    base_dir = Path(__file__).resolve().parents[2]
    target_dir = base_dir / "tools" / "locales" / "artifacts" / "fasttext"

    # Model filenames and their URLs
    models = {
        "lid.176.ftz": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz",
        "lid.176.bin": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
    }

    print(f"Downloading FastText models to: {target_dir}")

    try:
        for filename in models:
            destination = target_dir / filename

            if not destination.exists():
                print(f"Downloading {filename}...")
                download_file(models[filename], destination, f"Downloading {filename}")
                print(f"Successfully downloaded {filename}")
            else:
                print(f"Model {filename} already exists, skipping...")

        print("\nAll models downloaded successfully!")
        print(f"Models are located in: {target_dir}")

    except Exception as e:
        print(f"Error downloading models: {e!s}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|