MLATE 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__init__.py ADDED
@@ -0,0 +1 @@
1
+ """MLATE — Multi-dimensional Literature Analysis and Thematic Exploration"""
__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from cli import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
cli.py ADDED
@@ -0,0 +1,99 @@
1
+ """MLATE CLI 入口"""
2
+ import argparse
3
+ from pipeline import MLATEPipeline
4
+ import config
5
+ from logger import logger
6
+
7
+
8
def _build_parser():
    """Build the ``mlate`` argument parser with its global options and sub-commands."""
    parser = argparse.ArgumentParser(
        prog="mlate",
        description="Multi-dimensional Literature Analysis and Thematic Exploration",
    )
    g = parser.add_argument_group("全局选项")
    g.add_argument("--api-base", help="API Base URL")
    g.add_argument("--model", default=None, help="模型名(默认按配置文件/环境变量/模型推断)")
    g.add_argument("--workers", type=int, default=8)
    g.add_argument("--qps", type=float, default=2.0)
    g.add_argument("--lang", choices=["cn", "en", "both"], help="Output language (cn/en/both)")
    g.add_argument("--source", choices=["auto", "wos", "scopus", "standard"], default="auto",
                   help="Data source type (auto-detect, wos, scopus, standard)")

    sub = parser.add_subparsers(dest="command", required=True)

    # ── config ──
    pc = sub.add_parser("config", help="管理配置(API Key / Base URL / Model)")
    pc.add_argument("action", choices=["init", "set", "get", "show"],
                    help="init=初始化 | set=设置值 key value | get=读取值 key | show=查看全部")
    pc.add_argument("args", nargs="*", help="set 需要 key value;get 需要 key")

    # ── filter ──
    pf = sub.add_parser("filter", help="主题筛选:LLM 按用户主题评分排名,筛选 ≥ min-score")
    pf.add_argument("--input", required=True)
    pf.add_argument("--output", required=True)
    pf.add_argument("--topic", required=True, help="筛选主题,如 'ship digital twin propulsion'")
    pf.add_argument("--min-score", type=float, default=3.0, help="最低评分(1-5),默认 3.0")
    pf.add_argument("--output-lang", help="LLM 理由与标准的输出语言(默认读取配置 output_lang)")

    # ── explore ──
    pe = sub.add_parser("explore", help="主题探索:第一阶段(自发发现草案 - 逐篇分析)")
    pe.add_argument("--input", required=True, help="筛选后的 CSV 输入")
    pe.add_argument("--output", required=True, help="输出原始草案 JSON 路径")
    pe.add_argument("--max-papers", type=int)
    pe.add_argument("--dims", help="初始维度(逗号分隔)")
    pe.add_argument("--guide", help="研究者引导词")
    pe.add_argument("--output-lang", help="LLM 发现结果的输出语言(默认读取配置 output_lang)")

    # ── converge ──
    pcv = sub.add_parser("converge", help="主题探索:第二阶段(智能收敛与总结)")
    pcv.add_argument("--input", required=True, help="explore 产出的原始草案 JSON")
    pcv.add_argument("--output", required=True, help="输出最终收敛后的 taxonomy JSON")
    pcv.add_argument("--output-csv", help="输出带标签的文献 CSV 路径")
    pcv.add_argument("--limit-cats", type=int, default=10, help="每个维度保留的分类上限")
    pcv.add_argument("--dims", help="指定收敛的维度(默认全量)")
    pcv.add_argument("--guide", help="收敛阶段的引导词")
    pcv.add_argument("--source-csv", help="可选:原始 CSV 路径(用于将 ID 映射回论文标题)")
    pcv.add_argument("--output-lang", help="LLM 收敛定义的输出语言(默认读取配置 output_lang)")

    # ── translate ──
    ptr = sub.add_parser("translate", help="自动翻译:对 CSV 列或 JSON 内容进行 LLM 翻译")
    ptr.add_argument("--input", required=True)
    ptr.add_argument("--output", required=True)
    # Stored under its own dest: argparse copies every sub-parser attribute
    # (defaults included) onto the main namespace, so sharing dest="lang" with
    # the global log-language option would overwrite it with "中文".
    ptr.add_argument("--lang", dest="target_lang", default="中文", help="目标语言(默认中文)")
    ptr.add_argument("--cols", help="如果是 CSV,指定需要翻译的列名(逗号分隔)")

    return parser


def main():
    """Entry point of the ``mlate`` command-line interface.

    Parses ``sys.argv``, applies the log language (CLI value, else the ``lang``
    key of the config file), resolves the LLM output language (CLI value, else
    the ``output_lang`` config key, else "中文") and dispatches to the selected
    sub-command. The ``config`` sub-command is handled before a pipeline is
    constructed.
    """
    args = _build_parser().parse_args()

    # Log language: CLI > config file (the logger itself falls back to the environment).
    cfg = config.load()
    lang = args.lang or cfg.get("lang")
    if lang:
        logger.set_lang(lang)

    # Output language: CLI > config file > "中文". Not every sub-command defines
    # --output-lang, hence getattr.
    output_lang = getattr(args, "output_lang", None) or cfg.get("output_lang") or "中文"

    # The config sub-command does not need a pipeline.
    if args.command == "config":
        config.cmd_config(args)
        return

    pipe = MLATEPipeline(
        model=args.model, api_base=args.api_base,
        max_workers=args.workers, qps=args.qps,
    )

    if args.command == "filter":
        pipe.filter(args.input, args.output, topic=args.topic, min_score=args.min_score,
                    source_type=args.source, language=output_lang)
    elif args.command == "explore":
        pipe.explore(args.input, args.output, args.max_papers,
                     source_type=args.source, researcher_guide=args.guide,
                     initial_dims=args.dims, language=output_lang)
    elif args.command == "converge":
        pipe.converge(args.input, args.output, limit_cats=args.limit_cats,
                      target_dims=args.dims, researcher_guide=args.guide,
                      source_csv=args.source_csv, output_csv=args.output_csv, language=output_lang)
    elif args.command == "translate":
        pipe.translate(args.input, args.output, target_lang=args.target_lang, columns=args.cols)


if __name__ == "__main__":
    main()
config.py ADDED
@@ -0,0 +1,172 @@
1
+ """Configuration management: ~/.mlate/config.json + Env var auto-detection
2
+
3
+ Security Design:
4
+ - API Keys are NEVER stored on disk.
5
+ - API Keys must be provided via environment variables (MLATE_API_KEY) or CLI arguments.
6
+ - Base URL and Model are non-sensitive and can be persisted safely.
7
+ """
8
+ import os, json, sys, shutil
9
+ from pathlib import Path
10
+ from logger import logger
11
+
12
+ CONFIG_DIR = Path.home() / ".mlate"
13
+ CONFIG_FILE = CONFIG_DIR / "config.json"
14
+ # Priority for env vars
15
+ ENV_KEY_PRIORITY = ["MLATE_API_KEY", "LLM_API_KEY", "OPENAI_API_KEY", "DEEPSEEK_API_KEY"]
16
+
17
+
18
def _ensure_dir():
    """Create ``CONFIG_DIR`` (with parents) if needed and, off Windows, set its mode to 0o700."""
    CONFIG_DIR.mkdir(exist_ok=True, parents=True)
    on_windows = sys.platform == "win32"
    if not on_windows:
        CONFIG_DIR.chmod(0o700)
22
+
23
+
24
def load() -> dict:
    """Read the persisted configuration from ``CONFIG_FILE``.

    Returns:
        The parsed JSON object. An empty dict is returned when the file is
        missing or unreadable, is not valid JSON / valid UTF-8, or when its
        top-level value is not a JSON object (callers rely on ``.get``).
    """
    try:
        # utf-8-sig reads plain UTF-8 and also tolerates a BOM left by some editors.
        with open(CONFIG_FILE, encoding="utf-8-sig") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}
32
+
33
+
34
def save(cfg: dict):
    """Persist *cfg* to ``CONFIG_FILE`` as indented JSON.

    The data is written to a sibling ``.tmp`` file first and then moved over
    the real file, so a failed write does not leave a truncated config behind.
    Off Windows the file mode is restricted to 0o600 before the move.
    """
    _ensure_dir()
    tmp = CONFIG_FILE.with_suffix(".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2)
    if sys.platform != "win32":
        tmp.chmod(0o600)
    # Path.replace overwrites an existing target on every platform, whereas
    # Path.rename raises FileExistsError on Windows once config.json exists.
    tmp.replace(CONFIG_FILE)
42
+
43
+
44
def mask_key(key: str) -> str:
    """Return a display form of *key*.

    Keys longer than 12 characters keep their first 8 and last 4 characters
    around an ellipsis; anything shorter is replaced by ``"****"``.
    """
    if len(key) > 12:
        head, tail = key[:8], key[-4:]
        return f"{head}…{tail}"
    return "****"
48
+
49
+
50
def resolve_api_key() -> str:
    """Return the API key from the environment.

    The variables named in ``ENV_KEY_PRIORITY`` are checked in order and the
    first non-empty value wins; ``""`` is returned when none is set.
    """
    candidates = (os.environ.get(name) for name in ENV_KEY_PRIORITY)
    return next((value for value in candidates if value), "")
57
+
58
+
59
+ def resolve_base_url(model: str, cli_value: str | None = None) -> str:
60
+ if cli_value:
61
+ return cli_value
62
+ env = os.environ.get("MLATE_API_BASE")
63
+ if env:
64
+ return env
65
+ cfg = load()
66
+ cfg_base = cfg.get("api_base")
67
+ if cfg_base:
68
+ return cfg_base
69
+ m = model.lower()
70
+ if any(k in m for k in ["deepseek"]):
71
+ return "https://api.deepseek.com"
72
+ if any(k in m for k in ["gpt", "o1", "o3"]):
73
+ return "https://api.openai.com/v1"
74
+ return "https://api.deepseek.com"
75
+
76
+
77
+ def resolve_model(cli_value: str | None = None) -> str:
78
+ if cli_value:
79
+ return cli_value
80
+ env = os.environ.get("MLATE_MODEL")
81
+ if env:
82
+ return env
83
+ cfg = load()
84
+ return cfg.get("model", "deepseek-chat")
85
+
86
+
87
+ # ── CLI 交互 ──────────────────────────────────────────────────
88
def _print_setup_guide():
    """Log instructions for supplying the API key through an environment variable."""
    rule = "=" * 54
    logger.info("")
    logger.info(rule)
    # The logger's bilingual methods take (cn, en): Chinese first, English second.
    # Passing them the other way round prints Chinese in English mode.
    logger.info(" API Key 未设置,请通过环境变量配置:", " API Key not set. Please use environment variables:")
    logger.info("")
    logger.info(" 设置环境变量(推荐):", " Set environment variable (Recommended):")
    logger.info(" $env:MLATE_API_KEY = \"sk-xxx\" # Windows PowerShell")
    logger.info(' export MLATE_API_KEY="sk-xxx" # Linux/macOS')
    logger.info(rule)
97
+
98
+
99
def cmd_config(args):
    """Handle ``mlate config <action> [args...]``.

    Args:
        args: Parsed CLI namespace providing ``action`` (one of "init", "set",
            "get", "show") and ``args`` (the remaining positional values).

    Actions:
        init: create an empty config file if none exists.
        set:  store ``key value`` in the config file; ``api_key`` is refused.
        get:  print the stored value of ``key`` (empty string when unset). For
              ``api_key`` the environment-provided key is printed in masked form.
        show: log the effective model / base URL / languages and the raw config
              file content; a legacy ``api_key`` entry is removed from the file.
    """
    action = args.action
    rest = args.args

    # NOTE: the logger's bilingual methods take (cn, en) in that order.
    if action == "init":
        _ensure_dir()
        if not CONFIG_FILE.exists():
            save({})
            logger.success(f"已创建 {CONFIG_FILE}", f"Created {CONFIG_FILE}")
        else:
            logger.info(f"已存在: {CONFIG_FILE}", f"Already exists: {CONFIG_FILE}")

    elif action == "set":
        if len(rest) < 2:
            logger.info("用法: mlate config set <key> <value>", "Usage: mlate config set <key> <value>")
            logger.info(" 可用键: api_base, model, lang, output_lang", " Keys: api_base, model, lang, output_lang")
            return
        key, value = rest[0], rest[1]
        if key == "api_key":
            logger.error("安全策略:api_key 不支持存储在配置文件中。",
                         "Security policy: api_key cannot be stored in config file.")
            logger.info("请改用环境变量 MLATE_API_KEY。",
                        "Please use environment variable MLATE_API_KEY instead.")
            return
        cfg = load()
        cfg[key] = value
        save(cfg)
        logger.success(f"已设置 {key}", f"Successfully set {key}")

    elif action == "get":
        if not rest:
            logger.info("用法: mlate config get <key>", "Usage: mlate config get <key>")
            logger.info(" 可用键: api_base, model, lang, output_lang", " Keys: api_base, model, lang, output_lang")
            return
        key = rest[0]
        if key == "api_key":
            # Never echo the raw secret to the terminal; print a masked form.
            secret = resolve_api_key()
            print(mask_key(secret) if secret else "")
            return
        cfg = load()
        print(cfg.get(key, ""))

    elif action == "show":
        logger.info(f"配置文件: {CONFIG_FILE}", f"Config file: {CONFIG_FILE}")
        logger.info("环境变量: MLATE_API_KEY", "Env variable: MLATE_API_KEY")
        logger.info("")

        cfg = load()
        model = resolve_model()
        base = resolve_base_url(model)
        key = resolve_api_key()

        logger.info("当前生效配置:", "Current effective configuration:")
        logger.info(f" model = {model}")
        logger.info(f" api_base = {base}")
        logger.info(f" log_lang = {cfg.get('lang', 'en')}")
        logger.info(f" output_lang = {cfg.get('output_lang', '中文')}")
        logger.info("")

        if "api_key" in cfg:
            del cfg["api_key"]  # Clean up legacy key if exists
            save(cfg)

        if cfg:
            logger.info(f"配置文件内容 ({len(cfg)} 项):", f"Config file content ({len(cfg)} items):")
            for k, v in cfg.items():
                logger.info(f" {k} = {v}")
        else:
            logger.info("配置文件为空", "Config file is empty")

        if not key:
            _print_setup_guide()
llm.py ADDED
@@ -0,0 +1,79 @@
1
+ """LLM 调用封装 + RateLimiter
2
+
3
+ API Key 与 Base URL 解析委托给 config 模块,实现:
4
+ CLI 参数 > 环境变量 > 配置文件 > 模型名推断
5
+ """
6
+ import os, time, threading, json
7
+ from openai import OpenAI
8
+ import config
9
+ from logger import logger
10
+
11
+ __all__ = ["RateLimiter", "LLM"]
12
+
13
+
14
class RateLimiter:
    """Space out calls so that at most ``qps`` of them proceed per second.

    ``wait()`` blocks the caller until the next slot is free. Slots are handed
    out under a lock, so several threads can share one limiter. A ``qps`` of
    zero or less disables limiting.
    """

    def __init__(self, qps: float = 2.0):
        self.qps = float(qps)
        # Minimum spacing between two calls, in seconds (0 = unlimited).
        self.min_interval = 1.0 / self.qps if self.qps > 0 else 0
        self._lock = threading.Lock()
        # Earliest monotonic timestamp at which the next call may proceed.
        self._next_time = 0.0

    def wait(self):
        """Block until the next call slot, then reserve the following one."""
        if self.min_interval <= 0:
            return
        with self._lock:
            # Monotonic clock: wall-clock adjustments (NTP steps, manual
            # changes) must not stretch or collapse the interval.
            now = time.monotonic()
            if now < self._next_time:
                time.sleep(self._next_time - now)
            self._next_time = max(now, self._next_time) + self.min_interval
29
+
30
+
31
+ class LLM:
32
+ def __init__(self, model: str = None, api_base: str = None):
33
+ self.model = config.resolve_model(model)
34
+ self.api_key = config.resolve_api_key()
35
+ self.api_base = config.resolve_base_url(self.model, api_base)
36
+
37
+ if not self.api_key:
38
+ logger.error("API Key not set. Please configure via environment variables:", "API Key 未设置。请通过环境变量配置:")
39
+ logger.info(" $env:MLATE_API_KEY = \"sk-xxx\" # Windows PowerShell")
40
+ logger.info(' export MLATE_API_KEY="sk-xxx" # Linux/macOS')
41
+ raise ValueError("MLATE_API_KEY not set")
42
+
43
+ self.client = OpenAI(base_url=self.api_base, api_key=self.api_key)
44
+
45
+ def chat(self, messages: list, temperature: float = 0.2, retries: int = 2) -> str | None:
46
+ """Standard chat completion for plain text"""
47
+ last_err = None
48
+ for attempt in range(retries + 1):
49
+ try:
50
+ resp = self.client.chat.completions.create(
51
+ model=self.model,
52
+ messages=messages,
53
+ temperature=temperature,
54
+ )
55
+ return resp.choices[0].message.content
56
+ except Exception as e:
57
+ last_err = e
58
+ if attempt < retries:
59
+ time.sleep(1)
60
+ logger.error(f"LLM 调用失败: {last_err}", f"LLM call failed: {last_err}")
61
+ return None
62
+
63
+ def chat_json(self, messages: list, temperature: float = 0.2, retries: int = 2) -> dict | None:
64
+ last_err = None
65
+ for attempt in range(retries + 1):
66
+ try:
67
+ resp = self.client.chat.completions.create(
68
+ model=self.model,
69
+ messages=messages,
70
+ response_format={"type": "json_object"},
71
+ temperature=temperature,
72
+ )
73
+ return json.loads(resp.choices[0].message.content)
74
+ except Exception as e:
75
+ last_err = e
76
+ if attempt < retries:
77
+ time.sleep(1)
78
+ logger.error(f"LLM 调用失败: {last_err}", f"LLM call failed: {last_err}")
79
+ return None
loaders.py ADDED
@@ -0,0 +1,108 @@
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from logger import logger
4
+
5
class BaseLoader:
    """Load literature exports and locate their title / abstract / keywords columns."""

    # Candidate column names per data source, in priority order.
    MAPPINGS = {
        "wos": {
            "title": ["TI", "Article Title", "Title"],
            "abstract": ["AB", "Abstract"],
            "keywords": ["DE", "Author Keywords", "ID", "Keywords Plus", "Keywords"]
        },
        "scopus": {
            "title": ["Title"],
            "abstract": ["Abstract"],
            "keywords": ["Author Keywords", "Index Keywords"]
        },
        "standard": {
            "title": ["Title", "题目", "标题"],
            "abstract": ["Abstract", "摘要"],
            "keywords": ["Keywords", "Author Keywords", "关键词"]
        }
    }

    @staticmethod
    def load(file_path: str, source_type: str = "auto") -> tuple[pd.DataFrame, dict]:
        """Read *file_path* and return ``(df, mapping)``.

        ``.csv`` is read as comma-separated, ``.txt``/``.tsv`` as tab-separated
        (both as UTF-8 with optional BOM) and ``.xls``/``.xlsx`` through
        ``pandas.read_excel``. Any load failure, including an unsupported
        extension (``ValueError``), is logged and re-raised.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        # 1. Load the raw data
        try:
            if ext == ".csv":
                df = pd.read_csv(file_path, encoding="utf-8-sig")
            elif ext in [".xls", ".xlsx"]:
                df = pd.read_excel(file_path)
            elif ext in [".txt", ".tsv"]:
                df = pd.read_csv(file_path, sep="\t", encoding="utf-8-sig")
            else:
                raise ValueError(f"Unsupported file format: {ext}")
        except Exception as e:
            logger.error(f"Failed to load file {file_path}: {e}")
            raise

        # 2. Adaptive column mapping (without renaming)
        return BaseLoader.identify_columns(df, source_type)

    @staticmethod
    def _find_by_heuristic(columns: list, words: tuple, tags: tuple):
        """Return the first column whose name contains one of *words* or equals one of *tags*.

        Short field tags such as "ti"/"ab" are compared for equality only: as
        substrings they also occur in unrelated names ("Publication Year",
        "Affiliations", "Database"). Returns ``None`` when nothing matches.
        """
        for col in columns:
            # str(): column labels are not guaranteed to be strings.
            name = str(col).strip().lower()
            if name in tags or any(word in name for word in words):
                return col
        return None

    @staticmethod
    def identify_columns(df: pd.DataFrame, source_type: str = "auto") -> tuple[pd.DataFrame, dict]:
        """Identify the key columns of *df* and return ``(df_cleaned, mapping)``.

        ``df_cleaned`` is *df* with duplicated column names dropped (first
        occurrence kept). ``mapping`` maps "title", "abstract" and "keywords"
        to the original column names; entries that could not be found keep the
        defaults "Title", "Abstract" and "Keywords". When *source_type* is
        "auto" (or unknown) the source whose candidate names match the most
        columns is used, falling back to "standard" when nothing matches.
        """
        # Deduplicate columns first
        df = df.loc[:, ~df.columns.duplicated()]
        columns = df.columns.tolist()

        if source_type != "auto" and source_type in BaseLoader.MAPPINGS:
            mapping_to_use = BaseLoader.MAPPINGS[source_type]
        else:
            # Start at 0 so a source has to match at least one column to
            # displace the "standard" default.
            best_score = 0
            best_source = "standard"
            for src, maps in BaseLoader.MAPPINGS.items():
                score = sum(1 for target_list in maps.values()
                            for col in target_list if col in columns)
                if score > best_score:
                    best_score = score
                    best_source = src
            mapping_to_use = BaseLoader.MAPPINGS[best_source]
            logger.info(f"Detected source type: {best_source}")

        # Final mapping of original column names
        res_mapping = {"title": "Title", "abstract": "Abstract", "keywords": "Keywords"}
        found = {"title": False, "abstract": False, "keywords": False}

        # 1. Try predefined mappings
        for internal_name, candidates in mapping_to_use.items():
            for cand in candidates:
                if cand in columns:
                    res_mapping[internal_name] = cand
                    found[internal_name] = True
                    break

        # 2. Heuristic fallback
        if not found["title"]:
            guess = BaseLoader._find_by_heuristic(columns, words=("title",), tags=("ti",))
            if guess is not None:
                res_mapping["title"] = guess
                found["title"] = True

        if not found["abstract"]:
            guess = BaseLoader._find_by_heuristic(columns, words=("abstract", "summary"), tags=("ab",))
            if guess is not None:
                res_mapping["abstract"] = guess
                found["abstract"] = True

        if not found["title"]:
            logger.warning("Could not find Title column automatically.")

        # Log the identified mappings
        logger.section("列名映射确认", "Column Mapping Confirmation")
        logger.info(f" 题目 (Title) : {res_mapping['title']}")
        logger.info(f" 摘要 (Abstract) : {res_mapping['abstract']}")
        logger.info(f" 关键词 (Keywords): {res_mapping['keywords']}")

        return df, res_mapping
logger.py ADDED
@@ -0,0 +1,70 @@
1
+ import logging
2
+ import sys
3
+ import os
4
+
5
class MLATELogger:
    """Logger supporting English (default), Chinese, and bilingual output.

    All message methods take the Chinese text first and the English text
    second: ``logger.info(cn, en)``. When only one text is given it is emitted
    as-is regardless of the language setting.
    """

    _VALID_LANGS = ("cn", "en", "both")

    def __init__(self):
        self._logger = logging.getLogger("mlate")
        self._logger.setLevel(logging.INFO)
        self._formatter = logging.Formatter("%(message)s")

        # logging.getLogger returns the same object on every call, so attach
        # the stdout handler only once; otherwise a second instance would
        # make every message appear twice.
        for handler in self._logger.handlers:
            if getattr(handler, "_mlate_stdout", False):
                self._handler = handler
                break
        else:
            self._handler = logging.StreamHandler(sys.stdout)
            self._handler._mlate_stdout = True
            self._logger.addHandler(self._handler)
        self._handler.setFormatter(self._formatter)

        # Language settings: 'cn', 'en', 'both'
        # Priority: Env var MLATE_LANG > Default (en). Unknown values fall
        # back to 'en' instead of silently switching to bilingual output.
        env_lang = os.environ.get("MLATE_LANG", "en").strip().lower()
        self.lang = env_lang if env_lang in self._VALID_LANGS else "en"

    def set_level(self, level):
        """Set logging level."""
        self._logger.setLevel(level)

    def set_lang(self, lang: str):
        """Set output language: 'cn', 'en', 'both' (case-insensitive); other values are ignored."""
        if isinstance(lang, str):
            lang = lang.strip().lower()
        if lang in self._VALID_LANGS:
            self.lang = lang

    def _format(self, cn: str, en: str = None) -> str:
        """Format message based on language settings."""
        if en is None:
            return cn

        if self.lang == "cn":
            return cn
        elif self.lang == "en":
            return en
        else:
            # Bilingual mode
            return f"{cn} | {en}"

    def info(self, cn: str, en: str = None):
        """Log info message."""
        self._logger.info(self._format(cn, en))

    def error(self, cn: str, en: str = None):
        """Log error message."""
        self._logger.error(self._format(cn, en))

    def warning(self, cn: str, en: str = None):
        """Log warning message."""
        self._logger.warning(self._format(cn, en))

    def success(self, cn: str, en: str = None):
        """Log success message with a checkmark."""
        self._logger.info(f"✓ {self._format(cn, en)}")

    def section(self, cn: str, en: str = None):
        """Log section header."""
        msg = self._format(cn, en)
        self._logger.info(f"\n── {msg} ──")

    def separator(self):
        """Log separator line."""
        self._logger.info("-" * 50)


# Global singleton
logger = MLATELogger()