jtcg_locale_detector 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +37 -0
  3. data/PACKAGING_SUMMARY.md +195 -0
  4. data/README.md +226 -0
  5. data/bin/locale-detector +159 -0
  6. data/jtcg_locale_detector.gemspec +48 -0
  7. data/lib/locale_detector/client.rb +163 -0
  8. data/lib/locale_detector/detector.rb +46 -0
  9. data/lib/locale_detector/version.rb +3 -0
  10. data/lib/locale_detector.rb +25 -0
  11. data/locale_detector.gemspec +46 -0
  12. data/python/cli.py +220 -0
  13. data/python/requirements.txt +8 -0
  14. data/python/src/__init__.py +10 -0
  15. data/python/src/__pycache__/__init__.cpython-311.pyc +0 -0
  16. data/python/src/__pycache__/__init__.cpython-313.pyc +0 -0
  17. data/python/src/__pycache__/locale_data.cpython-311.pyc +0 -0
  18. data/python/src/__pycache__/locale_data.cpython-313.pyc +0 -0
  19. data/python/src/__pycache__/locale_detector.cpython-311.pyc +0 -0
  20. data/python/src/__pycache__/locale_detector.cpython-313.pyc +0 -0
  21. data/python/src/artifacts/fasttext/lid.176.bin +0 -0
  22. data/python/src/artifacts/fasttext/lid.176.ftz +0 -0
  23. data/python/src/download_fasttext.py +69 -0
  24. data/python/src/locale_data.py +178 -0
  25. data/python/src/locale_detector.py +534 -0
  26. data/python/src/locale_detector_c.c +403 -0
  27. data/python/src/locale_detector_c.h +37 -0
  28. data/python/src/locale_detector_cy.cpp +23126 -0
  29. data/python/src/locale_detector_cy.cpython-311-darwin.so +0 -0
  30. data/python/src/locale_detector_cy.cpython-313-darwin.so +0 -0
  31. data/python/src/locale_detector_cy.html +6460 -0
  32. data/python/src/locale_detector_cy.pyx +501 -0
  33. data/python/src/utils/__init__.py +1 -0
  34. data/python/src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  35. data/python/src/utils/__pycache__/__init__.cpython-313.pyc +0 -0
  36. data/python/src/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
  37. data/python/src/utils/__pycache__/data_utils.cpython-313.pyc +0 -0
  38. data/python/src/utils/data_utils.py +50 -0
  39. data/python/src/utils/data_utils_cy.cpp +10086 -0
  40. data/python/src/utils/data_utils_cy.cpython-311-darwin.so +0 -0
  41. data/python/src/utils/data_utils_cy.cpython-313-darwin.so +0 -0
  42. data/python/src/utils/data_utils_cy.html +600 -0
  43. data/python/src/utils/data_utils_cy.pyx +94 -0
  44. data/python/src/zhon/__init__.py +7 -0
  45. data/python/src/zhon/__pycache__/__init__.cpython-311.pyc +0 -0
  46. data/python/src/zhon/__pycache__/hanzi.cpython-311.pyc +0 -0
  47. data/python/src/zhon/__pycache__/pinyin.cpython-311.pyc +0 -0
  48. data/python/src/zhon/__pycache__/zhuyin.cpython-311.pyc +0 -0
  49. data/python/src/zhon/cedict/__init__.py +14 -0
  50. data/python/src/zhon/cedict/__pycache__/__init__.cpython-311.pyc +0 -0
  51. data/python/src/zhon/cedict/__pycache__/all.cpython-311.pyc +0 -0
  52. data/python/src/zhon/cedict/__pycache__/simplified.cpython-311.pyc +0 -0
  53. data/python/src/zhon/cedict/__pycache__/traditional.cpython-311.pyc +0 -0
  54. data/python/src/zhon/cedict/all.py +4 -0
  55. data/python/src/zhon/cedict/simplified.py +4 -0
  56. data/python/src/zhon/cedict/traditional.py +4 -0
  57. data/python/src/zhon/hanzi.py +81 -0
  58. data/python/src/zhon/pinyin.py +187 -0
  59. data/python/src/zhon/zhuyin.py +46 -0
  60. metadata +198 -0
@@ -0,0 +1,163 @@
1
+ require "json"
2
+ require "open3"
3
+ require "tempfile"
4
+
5
module LocaleDetector
  # Ruby front end for the bundled Python locale-detection CLI.
  #
  # Every detection call spawns `<python_path> <cli_path> ...` as a subprocess
  # and parses whatever the CLI prints on stdout.
  class Client
    # Oldest Python release accepted by #validate_python_environment.
    MINIMUM_PYTHON_VERSION = Gem::Version.new("3.11")
    private_constant :MINIMUM_PYTHON_VERSION

    attr_reader :options

    # @param options [Hash] overrides for the defaults: :mode, :low_memory,
    #   :output_format, :encoding, :python_path, :cli_path
    # @raise [LocaleDetector::Error] when no usable Python / CLI script is found
    def initialize(**options)
      @options = default_options.merge(options)
      # Probe the system only for values the caller did not supply, so an
      # explicit :python_path / :cli_path works even when discovery would fail.
      @options[:python_path] ||= find_python_executable
      @options[:cli_path] ||= find_cli_path
      validate_python_environment
    end

    # Detect the locale of a single text.
    #
    # @return [Hash, nil] parsed CLI result; nil for nil/blank input
    def detect(text)
      return nil if text.nil? || text.strip.empty?

      result = execute_detection(text: text)
      parse_result(result)
    end

    # Detect locales for several texts.
    #
    # Delegates to #detect once per element, i.e. one subprocess per text.
    #
    # @return [Array<Hash, nil>] one entry per input, in order
    def detect_batch(texts)
      return [] if texts.nil? || texts.empty?

      texts.map { |text| detect(text) }
    end

    # Detect the locale of a file's contents (the CLI reads the file itself).
    #
    # @raise [LocaleDetector::Error] when the file does not exist
    def detect_file(file_path)
      raise Error, "File not found: #{file_path}" unless File.exist?(file_path)

      result = execute_detection(file: file_path)
      parse_result(result)
    end

    # Detect with the CLI's --details flag; the response is parsed as JSON.
    #
    # @return [Hash, nil] parsed JSON; nil for nil/blank input
    def detect_with_details(text)
      return nil if text.nil? || text.strip.empty?

      result = execute_detection(text: text, details: true)
      parse_detailed_result(result)
    end

    private

    # Static defaults. :python_path and :cli_path are resolved lazily in
    # #initialize because finding them touches the file system / PATH.
    def default_options
      {
        mode: "ratio",          # conversion, ratio, both
        low_memory: false,      # use low memory mode
        output_format: "json",  # simple, json, detailed
        encoding: "utf-8",      # file encoding
        python_path: nil,
        cli_path: nil
      }
    end

    # Return the first of python3/python that `which` can locate.
    def find_python_executable
      %w[python3 python].each do |cmd|
        begin
          _output, status = Open3.capture2e("which", cmd)
        rescue SystemCallError
          # `which` itself is unavailable; fall through to the error below.
          break
        end
        return cmd if status.success?
      end

      raise Error, "Python executable not found. Please install Python 3.11+."
    end

    # Locate cli.py relative to the gem root (two directories above this file).
    def find_cli_path
      gem_root = File.expand_path("../..", __dir__)
      cli_candidates = [
        File.join(gem_root, "python", "cli.py"),
        File.join(gem_root, "..", "cli.py") # Parent directory
      ].map { |path| File.expand_path(path) }

      found = cli_candidates.find { |path| File.exist?(path) }
      return found if found

      raise Error, "CLI script not found. Please ensure the Python components are installed."
    end

    # Ensure the configured interpreter runs and reports at least Python 3.11.
    def validate_python_environment
      # argv form (no shell): matches how #execute_detection launches Python
      # and keeps paths with spaces or shell metacharacters intact.
      output, status = Open3.capture2e(@options[:python_path].to_s, "--version")
      raise Error, "Python is not available: #{output}" unless status.success?

      version_match = output.match(/Python (\d+\.\d+)/)
      return unless version_match

      # Compare as versions, not floats: "3.9".to_f (3.9) is greater than 3.11.
      version = Gem::Version.new(version_match[1])
      return unless version < MINIMUM_PYTHON_VERSION

      raise Error, "Python version must be 3.11 or higher. Found: #{version}"
    rescue SystemCallError => e
      raise Error, "Python is not available: #{e.message}"
    end

    # Run the CLI and return its stripped stdout.
    #
    # @raise [LocaleDetector::Error] on a non-zero exit status or spawn failure
    def execute_detection(text: nil, file: nil, details: false)
      cmd_args = build_command_args(text: text, file: file, details: details)

      begin
        output, error, status = Open3.capture3(*cmd_args)
      rescue SystemCallError => e
        raise Error, "Detection failed: #{e.message}"
      end

      raise Error, "Detection failed: #{error}" unless status.success?

      output.strip
    end

    # Build the argv array for one CLI invocation.
    def build_command_args(text: nil, file: nil, details: false)
      # #parse_detailed_result always parses JSON, so a details request must
      # ask the CLI for JSON regardless of the configured output format.
      output_format = details ? "json" : @options[:output_format].to_s

      args = [@options[:python_path].to_s, @options[:cli_path].to_s]
      # "--file=<path>" keeps a path that begins with "-" attached to its option.
      args << "--file=#{file}" if text.nil? && file

      args += ["--mode", @options[:mode].to_s]
      args += ["--output-format", output_format]
      args += ["--encoding", @options[:encoding].to_s]
      args << "--low-memory" if @options[:low_memory]
      args << "--details" if details

      # The text goes last, after "--", so input that starts with "-" is
      # treated as the positional argument rather than as an option.
      args += ["--", text] if text

      args
    end

    # Parse CLI stdout according to the configured :output_format.
    def parse_result(output)
      case @options[:output_format].to_s
      when "json"
        begin
          JSON.parse(output)
        rescue JSON::ParserError => e
          raise Error, "Failed to parse JSON response: #{e.message}"
        end
      when "detailed"
        parse_detailed_output(output)
      else # "simple" or any other format
        { "locale" => output.strip }
      end
    end

    def parse_detailed_result(output)
      JSON.parse(output)
    rescue JSON::ParserError => e
      raise Error, "Failed to parse detailed JSON response: #{e.message}"
    end

    # Turn "Key: value" lines into a hash keyed by the snake_cased key.
    # Lines without a colon are ignored.
    def parse_detailed_output(output)
      output.split("\n").each_with_object({}) do |line, result|
        next unless line.include?(":")

        key, value = line.split(":", 2)
        result[key.strip.downcase.gsub(" ", "_")] = value.strip
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
module LocaleDetector
  # Static lookup tables describing what the detector can report.
  class Detector
    # Base language codes recognised by the detector.
    SUPPORTED_LANGUAGES = %w[
      af am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
      ca cbk ce ceb ckb co cs cv cy da de diq dsb dv dz ee el eml en eo es et
      eu ext fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy
      ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez
      li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn
      nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro
      ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw szl ta te tg th
      tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo
      yue zh zu
    ].freeze

    # Chinese locale codes mapped to a human-readable description.
    CHINESE_VARIANTS = {
      "zh-TW" => "Traditional Chinese (Taiwan)",
      "zh-CN" => "Simplified Chinese (China)",
      "zh-HK" => "Traditional Chinese (Hong Kong)",
      "zh-MO" => "Traditional Chinese (Macau)",
      "zh-SG" => "Simplified Chinese (Singapore)"
    }.freeze

    # Accepted values for the Chinese detection mode.
    DETECTION_MODES = %w[conversion ratio both].freeze

    def self.supported_languages
      SUPPORTED_LANGUAGES
    end

    def self.chinese_variants
      CHINESE_VARIANTS
    end

    def self.detection_modes
      DETECTION_MODES
    end

    # Whether the base language of +language_code+ is in SUPPORTED_LANGUAGES.
    #
    # The base language is the part before the first "-" or "_", compared
    # case-insensitively, so "zh-TW", "zh_TW" and "ZH" all resolve to "zh".
    # nil and blank input return false instead of raising.
    #
    # @param language_code [String, nil]
    # @return [Boolean]
    def self.language_supported?(language_code)
      base_language = language_code.to_s.strip.split(/[-_]/).first
      return false if base_language.nil? || base_language.empty?

      SUPPORTED_LANGUAGES.include?(base_language.downcase)
    end

    # Whether +locale+ is exactly one of the CHINESE_VARIANTS keys.
    def self.chinese_variant?(locale)
      CHINESE_VARIANTS.key?(locale)
    end
  end
end
@@ -0,0 +1,3 @@
1
module LocaleDetector
  # Release version of the gem, as a frozen string.
  VERSION = "1.0.1".freeze
end
@@ -0,0 +1,25 @@
1
+ require_relative "locale_detector/version"
2
+ require_relative "locale_detector/client"
3
+ require_relative "locale_detector/detector"
4
+
5
module LocaleDetector
  # Error type raised by this gem.
  class Error < StandardError
  end

  class << self
    # One-shot detection: builds a Client from +options+ and detects +text+.
    def detect(text, **options)
      Client.new(**options).detect(text)
    end

    # One-shot batch detection over +texts+ using a single Client.
    def detect_batch(texts, **options)
      Client.new(**options).detect_batch(texts)
    end

    # One-shot detection of the contents of +file_path+.
    def detect_file(file_path, **options)
      Client.new(**options).detect_file(file_path)
    end
  end
end
@@ -0,0 +1,46 @@
1
Gem::Specification.new do |spec|
  spec.name = "jtcg_locale_detector"
  spec.version = "1.0.1"
  spec.authors = ["JTCG Team"]
  spec.email = ["enor@j-tcg.com"]

  # Summary and description are assigned once; the values are the ones that
  # previously took effect (the later of two assignments).
  spec.summary = "多語言地區檢測工具,特別優化中文繁簡體檢測"
  spec.description = "Multi-language locale detector with specialized Chinese variant detection. " \
                     "Uses FastText for initial language identification and multiple algorithms " \
                     "for Chinese variant detection."
  spec.homepage = "https://github.com/jtcg/locale-detector"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  # Author role notes
  spec.metadata["packaged_by"] = "JTCG Team <enor@j-tcg.com>"
  spec.metadata["original_author"] = "Original Author <mark@j-tcg.com>"
  spec.metadata["maintainer"] = "JTCG Team"

  # Specify which files should be added to the gem when it is released.
  # Python bytecode caches are interpreter-specific build residue and are
  # regenerated on import, so they are left out of the package.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    Dir["{bin,lib,python}/**/*", "*.md", "*.txt", "*.gemspec"].reject do |path|
      path.include?("__pycache__") || path.end_with?(".pyc")
    end
  end

  spec.bindir = "bin"
  spec.executables = ["locale-detector"]
  spec.require_paths = ["lib"]

  # Dependencies
  spec.add_dependency "ffi", "~> 1.15"
  spec.add_dependency "json", "~> 2.6"

  # Development dependencies
  # ">= 1.17" rather than "~> 1.17": the pessimistic pin excludes Bundler 2.x.
  spec.add_development_dependency "bundler", ">= 1.17"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.12"
  spec.add_development_dependency "rubocop", "~> 1.50"

  # Metadata
  spec.metadata["rubygems_mfa_required"] = "true"
end
data/python/cli.py ADDED
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LocaleDetector CLI - 多語言地區檢測命令列工具
4
+
5
+ 支援檢測多種語言,特別針對中文繁簡體變體檢測進行優化。
6
+ 使用 FastText 進行初始語言識別,再對中文進行特化檢測。
7
+ """
8
+
9
+ import argparse
10
+ import asyncio
11
+ import json
12
+ import os
13
+ import sys
14
+
15
+ # 若在 PyInstaller 執行環境,自動將 _internal/src 加入 sys.path
16
+ if hasattr(sys, "_MEIPASS"):
17
+ cython_mod_path = os.path.join(sys._MEIPASS, "src")
18
+ if cython_mod_path not in sys.path:
19
+ sys.path.insert(0, cython_mod_path)
20
+ from pathlib import Path
21
+ from typing import Literal
22
+
23
+ try:
24
+ from src.locale_detector_cy import LocaleDetector
25
+ except ImportError:
26
+ from src.locale_detector import LocaleDetector
27
+
28
+
29
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the locale-detector command line.

    Exactly one input source is required: a positional text, ``-f/--file``,
    or ``--batch``. The remaining options select the detection mode, model
    size, output format, output file and file encoding.
    """
    usage_examples = """
使用範例:
%(prog)s "Hello world" # 檢測單一文本
%(prog)s -f input.txt # 從檔案讀取
%(prog)s "你好世界" --details # 顯示詳細信息
%(prog)s "中文文本" --mode ratio # 使用比例分析模式
%(prog)s --batch file1.txt file2.txt # 批次處理多個檔案
"""
    parser = argparse.ArgumentParser(
        description="多語言地區檢測工具 - 特別優化中文繁簡體檢測",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Input source: the three alternatives are mutually exclusive.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("text", nargs="?", help="要檢測的文本內容")
    source.add_argument("-f", "--file", type=Path, help="從檔案讀取文本進行檢測")
    source.add_argument("--batch", nargs="+", type=Path, help="批次處理多個檔案")

    add_option = parser.add_argument

    # Chinese detection mode.
    add_option(
        "--mode",
        choices=["conversion", "ratio", "both"],
        default="ratio",
        help="中文檢測模式: conversion(快速), ratio(準確), both(混合) [預設: ratio]",
    )

    # Memory footprint.
    add_option("--low-memory", action="store_true", help="使用低記憶體模式 (使用較小的 FastText 模型)")

    # Output format.
    add_option(
        "--output-format",
        choices=["simple", "json", "detailed"],
        default="simple",
        help="輸出格式: simple(簡單), json(JSON格式), detailed(詳細信息) [預設: simple]",
    )

    # Detail level.
    add_option("--details", action="store_true", help="顯示檢測詳細信息(包含信心分數)")

    # Output destination.
    add_option("-o", "--output", type=Path, help="將結果輸出到檔案")

    # File encoding.
    add_option("--encoding", default="utf-8", help="檔案編碼格式 [預設: utf-8]")

    return parser
79
+
80
+
81
def read_text_file(file_path: Path, encoding: str = "utf-8") -> str:
    """Read a text file and return its contents with surrounding whitespace stripped.

    The file is decoded with ``encoding`` first. If that raises
    ``UnicodeDecodeError``, the encodings gbk, big5 and latin1 are tried in
    that order and the first one that decodes is used.

    Raises:
        ValueError: if none of the encodings can decode the file.
    """
    try:
        content = file_path.read_text(encoding=encoding)
    except UnicodeDecodeError:
        # Fall back to other encodings.
        for candidate in ("gbk", "big5", "latin1"):
            try:
                content = file_path.read_text(encoding=candidate)
            except UnicodeDecodeError:
                continue
            break
        else:
            raise ValueError(f"無法解碼檔案 {file_path},請指定正確的編碼格式")
    return content.strip()
93
+
94
+
95
+ def format_output(result: dict | str, output_format: str, show_details: bool) -> str:
96
+ """格式化輸出結果"""
97
+ if isinstance(result, str):
98
+ # 簡單字串結果
99
+ if output_format == "json":
100
+ return json.dumps({"locale": result}, ensure_ascii=False, indent=2)
101
+ return result
102
+
103
+ # 詳細結果字典
104
+ if output_format == "json":
105
+ return json.dumps(result, ensure_ascii=False, indent=2)
106
+ if output_format == "detailed" or show_details:
107
+ locale = result.get("locale", "unknown")
108
+ language = result.get("language", "unknown")
109
+ score = result.get("score", 0.0)
110
+ return f"地區: {locale} | 語言: {language} | 信心分數: {score:.3f}"
111
+ return result.get("locale", "unknown")
112
+
113
+
114
+ def write_output(content: str, output_file: Path | None, encoding: str = "utf-8") -> None:
115
+ """寫入輸出"""
116
+ if output_file:
117
+ output_file.write_text(content, encoding=encoding)
118
+ print(f"結果已儲存至: {output_file}")
119
+ else:
120
+ print(content)
121
+
122
+
123
async def process_single_text(
    detector: LocaleDetector, text: str, mode: Literal["conversion", "ratio", "both"], show_details: bool
) -> dict | str:
    """Detect the locale of one text.

    Returns whatever ``detector.adetect_with_details`` yields when
    ``show_details`` is set, otherwise the result of ``detector.adetect``.
    """
    if not show_details:
        # NOTE(review): ``mode`` is only forwarded on the details path below;
        # confirm whether ``adetect`` is meant to receive it as well.
        return await detector.adetect(text)
    return await detector.adetect_with_details(text, mode=mode)
130
+
131
+
132
async def process_batch(
    detector: LocaleDetector, file_paths: list[Path], mode: Literal["conversion", "ratio", "both"], encoding: str, show_details: bool
) -> list[dict]:
    """Detect the locale of each file, one after another.

    Every file contributes exactly one dictionary carrying a "file" key.
    A failure while reading or detecting is recorded for that file (with
    "error" and a "locale" of "error") instead of aborting the batch.
    """
    collected = []

    for path in file_paths:
        name = str(path)
        try:
            text = read_text_file(path, encoding)
            if show_details:
                entry = await detector.adetect_with_details(text, mode=mode)
                entry["file"] = name
            else:
                entry = {"file": name, "locale": await detector.adetect(text)}
        except Exception as exc:
            # Best effort per file: keep going and report the failure inline.
            entry = {"file": name, "error": str(exc), "locale": "error"}
        collected.append(entry)

    return collected
153
+
154
+
155
async def main() -> None:
    """Parse the command line, run detection and emit the result.

    Progress messages go to stderr so that stdout carries only the result.
    Exits with status 1 on empty input, on interruption, or on any error.
    """
    parser = create_parser()
    args = parser.parse_args()

    # The "detailed" output format needs the detailed result dictionary. Without
    # it the single-text path prints only the locale, and the batch path prints
    # placeholder language and score values. Treat it as implying --details.
    want_details = args.details or args.output_format == "detailed"

    try:
        # Initialise the detector.
        print("正在初始化 LocaleDetector...", file=sys.stderr)
        detector = LocaleDetector(low_memory=args.low_memory)

        if args.batch:
            # Batch mode: one result per file.
            print(f"正在處理 {len(args.batch)} 個檔案...", file=sys.stderr)
            results = await process_batch(detector, args.batch, args.mode, args.encoding, want_details)

            if args.output_format == "json":
                output = json.dumps(results, ensure_ascii=False, indent=2)
            else:
                output_lines = []
                for result in results:
                    if "error" in result:
                        line = f"{result['file']}: ERROR - {result['error']}"
                    else:
                        # Without details, format_output returns just the locale.
                        formatted = format_output(result, args.output_format, want_details)
                        line = f"{result['file']}: {formatted}"
                    output_lines.append(line)
                output = "\n".join(output_lines)

        else:
            # Single text, taken from --file or from the positional argument.
            text = read_text_file(args.file, args.encoding) if args.file else args.text

            if not text.strip():
                print("錯誤: 輸入文本為空", file=sys.stderr)
                sys.exit(1)

            print("正在檢測語言...", file=sys.stderr)
            result = await process_single_text(detector, text, args.mode, want_details)
            output = format_output(result, args.output_format, want_details)

        # Emit the result.
        write_output(output, args.output, args.encoding)

    except KeyboardInterrupt:
        print("\n程式被使用者中斷", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the error and exit non-zero.
        print(f"錯誤: {e}", file=sys.stderr)
        sys.exit(1)
208
+
209
+
210
def cli_entry_point() -> None:
    """CLI entry point, also used by the packaged executable.

    Runs ``main`` with ``asyncio.run``; if that raises ``RuntimeError``, runs
    ``main`` on the loop returned by ``asyncio.get_event_loop`` instead.
    """
    try:
        asyncio.run(main())
    except RuntimeError:
        asyncio.get_event_loop().run_until_complete(main())


if __name__ == "__main__":
    cli_entry_point()
@@ -0,0 +1,8 @@
1
+ fasttext-wheel>=0.9.2
2
+ opencc-python-reimplemented>=0.1.7
3
+ zhon>=2.0.2
4
+ requests>=2.31.0
5
+ tqdm>=4.66.0
6
+ numpy<2.0.0
7
+ nest-asyncio>=1.5.6
8
+ psutil>=5.9.0
@@ -0,0 +1,10 @@
1
+ """
2
+ Locale Detector Package
3
+
4
+ Multi-language locale detector with specialized Chinese variant detection.
5
+ """
6
+
7
+ from .locale_detector import LocaleDetector
8
+
9
+ __version__ = "1.0.0"
10
+ __all__ = ["LocaleDetector"]
@@ -0,0 +1,69 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ import requests
5
+ from tqdm import tqdm
6
+
7
+
8
def download_file(url: str, target_path: Path, desc: str, timeout: float = 30.0):
    """Download ``url`` to ``target_path`` while showing a progress bar.

    The body is streamed into a sibling ``<name>.part`` file, which is renamed
    to ``target_path`` only after the transfer finishes. A failed or
    interrupted download therefore never leaves a truncated file at
    ``target_path``.

    Args:
        url: address to fetch.
        target_path: final location of the downloaded file; parent
            directories are created as needed.
        desc: label shown on the progress bar.
        timeout: seconds to wait for the connection and for each read.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    # Ensure the directory exists
    target_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = target_path.with_name(target_path.name + ".part")

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail before writing anything: otherwise an HTTP error page would
            # be saved under the model's file name.
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))

            # Show download progress
            with (
                open(partial_path, "wb") as file,
                tqdm(
                    desc=desc,
                    total=total_size,
                    unit="iB",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as pbar,
            ):
                for data in response.iter_content(chunk_size=64 * 1024):
                    pbar.update(file.write(data))

        partial_path.replace(target_path)
    finally:
        # After a successful rename there is nothing left to remove; after a
        # failure this deletes the incomplete download.
        partial_path.unlink(missing_ok=True)
32
+
33
+
34
def main():
    """Download the FastText language-identification models that are missing.

    Files already present in the target directory are skipped. Any error is
    reported on stderr and the process exits with status 1.
    """
    # Target directory, resolved from two levels above this file's directory.
    # NOTE(review): the package ships its models under src/artifacts/fasttext,
    # which is not this directory - confirm which location the detector loads.
    base_dir = Path(__file__).resolve().parents[2]
    target_dir = base_dir / "tools" / "locales" / "artifacts" / "fasttext"

    # Model filenames mapped to their download URLs
    models = {
        "lid.176.ftz": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz",
        "lid.176.bin": "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
    }

    print(f"Downloading FastText models to: {target_dir}")

    try:
        for filename, url in models.items():
            destination = target_dir / filename

            if not destination.exists():
                print(f"Downloading {filename}...")
                download_file(url, destination, f"Downloading {filename}")
                print(f"Successfully downloaded {filename}")
            else:
                print(f"Model {filename} already exists, skipping...")

        print("\nAll models downloaded successfully!")
        print(f"Models are located in: {target_dir}")

    except Exception as e:
        print(f"Error downloading models: {e!s}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()