MLATE 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +1 -0
- __main__.py +4 -0
- cli.py +99 -0
- config.py +172 -0
- llm.py +79 -0
- loaders.py +108 -0
- logger.py +70 -0
- mlate-0.1.0.dist-info/METADATA +147 -0
- mlate-0.1.0.dist-info/RECORD +18 -0
- mlate-0.1.0.dist-info/WHEEL +5 -0
- mlate-0.1.0.dist-info/entry_points.txt +2 -0
- mlate-0.1.0.dist-info/licenses/LICENSE +674 -0
- mlate-0.1.0.dist-info/top_level.txt +10 -0
- pipeline.py +328 -0
- stages/__init__.py +0 -0
- stages/explore_stage.py +137 -0
- stages/filter_stage.py +86 -0
- utils.py +54 -0
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""MLATE — Multi-dimensional Literature Analysis and Thematic Exploration"""
|
__main__.py
ADDED
cli.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""MLATE CLI 入口"""
|
|
2
|
+
import argparse
|
|
3
|
+
from pipeline import MLATEPipeline
|
|
4
|
+
import config
|
|
5
|
+
from logger import logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _build_parser():
    """Build the ``mlate`` argument parser: global options plus every sub-command."""
    parser = argparse.ArgumentParser(prog="mlate", description="Multi-dimensional Literature Analysis and Thematic Exploration")
    g = parser.add_argument_group("全局选项")
    g.add_argument("--api-base", help="API Base URL")
    g.add_argument("--model", default=None, help="模型名(默认按配置文件/环境变量/模型推断)")
    g.add_argument("--workers", type=int, default=8)
    g.add_argument("--qps", type=float, default=2.0)
    g.add_argument("--lang", choices=["cn", "en", "both"], help="Output language (cn/en/both)")
    g.add_argument("--source", choices=["auto", "wos", "scopus", "standard"], default="auto",
                   help="Data source type (auto-detect, wos, scopus, standard)")

    sub = parser.add_subparsers(dest="command", required=True)

    # -- config --
    pc = sub.add_parser("config", help="管理配置(API Key / Base URL / Model)")
    pc.add_argument("action", choices=["init", "set", "get", "show"],
                    help="init=初始化 | set=设置值 key value | get=读取值 key | show=查看全部")
    pc.add_argument("args", nargs="*", help="set 需要 key value;get 需要 key")

    # -- filter --
    pf = sub.add_parser("filter", help="主题筛选:LLM 按用户主题评分排名,筛选 ≥ min-score")
    pf.add_argument("--input", required=True)
    pf.add_argument("--output", required=True)
    pf.add_argument("--topic", required=True, help="筛选主题,如 'ship digital twin propulsion'")
    pf.add_argument("--min-score", type=float, default=3.0, help="最低评分(1-5),默认 3.0")
    pf.add_argument("--output-lang", help="LLM 理由与标准的输出语言(默认读取配置 output_lang)")

    # -- explore --
    pe = sub.add_parser("explore", help="主题探索:第一阶段(自发发现草案 - 逐篇分析)")
    pe.add_argument("--input", required=True, help="筛选后的 CSV 输入")
    pe.add_argument("--output", required=True, help="输出原始草案 JSON 路径")
    pe.add_argument("--max-papers", type=int)
    pe.add_argument("--dims", help="初始维度(逗号分隔)")
    pe.add_argument("--guide", help="研究者引导词")
    pe.add_argument("--output-lang", help="LLM 发现结果的输出语言(默认读取配置 output_lang)")

    # -- converge --
    pcv = sub.add_parser("converge", help="主题探索:第二阶段(智能收敛与总结)")
    pcv.add_argument("--input", required=True, help="explore 产出的原始草案 JSON")
    pcv.add_argument("--output", required=True, help="输出最终收敛后的 taxonomy JSON")
    pcv.add_argument("--output-csv", help="输出带标签的文献 CSV 路径")
    pcv.add_argument("--limit-cats", type=int, default=10, help="每个维度保留的分类上限")
    pcv.add_argument("--dims", help="指定收敛的维度(默认全量)")
    pcv.add_argument("--guide", help="收敛阶段的引导词")
    pcv.add_argument("--source-csv", help="可选:原始 CSV 路径(用于将 ID 映射回论文标题)")
    pcv.add_argument("--output-lang", help="LLM 收敛定义的输出语言(默认读取配置 output_lang)")

    # -- translate --
    ptr = sub.add_parser("translate", help="自动翻译:对 CSV 列或 JSON 内容进行 LLM 翻译")
    ptr.add_argument("--input", required=True)
    ptr.add_argument("--output", required=True)
    # Stored under its own dest: argparse copies sub-parser values over the
    # parent namespace, so sharing dest "lang" with the global --lang would
    # replace the chosen log language with this option's default.
    ptr.add_argument("--lang", dest="target_lang", default="中文", help="目标语言(默认中文)")
    ptr.add_argument("--cols", help="如果是 CSV,指定需要翻译的列名(逗号分隔)")

    return parser


def main():
    """Parse the command line and run the selected ``mlate`` sub-command.

    ``config`` is handled by ``config.cmd_config`` without building a
    pipeline; ``filter``, ``explore``, ``converge`` and ``translate`` are
    dispatched to the matching ``MLATEPipeline`` method.
    """
    args = _build_parser().parse_args()

    # Log language: global --lang, then the "lang" entry of the config file.
    # When neither is set the logger keeps whatever it was initialised with.
    cfg = config.load()
    lang = args.lang or cfg.get("lang")
    if lang:
        logger.set_lang(lang)

    # LLM output language: --output-lang > config "output_lang" > "中文".
    # getattr is required because only some sub-commands define --output-lang.
    output_lang = getattr(args, "output_lang", None) or cfg.get("output_lang") or "中文"

    # The config sub-command does not need a pipeline.
    if args.command == "config":
        config.cmd_config(args)
        return

    pipe = MLATEPipeline(
        model=args.model, api_base=args.api_base,
        max_workers=args.workers, qps=args.qps,
    )

    if args.command == "filter":
        pipe.filter(args.input, args.output, topic=args.topic, min_score=args.min_score, source_type=args.source, language=output_lang)
    elif args.command == "explore":
        pipe.explore(args.input, args.output, args.max_papers,
                     source_type=args.source, researcher_guide=args.guide,
                     initial_dims=args.dims, language=output_lang)
    elif args.command == "converge":
        pipe.converge(args.input, args.output, limit_cats=args.limit_cats,
                      target_dims=args.dims, researcher_guide=args.guide,
                      source_csv=args.source_csv, output_csv=args.output_csv, language=output_lang)
    elif args.command == "translate":
        pipe.translate(args.input, args.output, target_lang=args.target_lang, columns=args.cols)


if __name__ == "__main__":
    main()
|
config.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Configuration management: ~/.mlate/config.json + Env var auto-detection
|
|
2
|
+
|
|
3
|
+
Security Design:
|
|
4
|
+
- API Keys are NEVER stored on disk.
|
|
5
|
+
- API Keys must be provided via environment variables (MLATE_API_KEY is checked first); they are not accepted as CLI arguments.
|
|
6
|
+
- Base URL and Model are non-sensitive and can be persisted safely.
|
|
7
|
+
"""
|
|
8
|
+
import os, json, sys, shutil
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from logger import logger
|
|
11
|
+
|
|
12
|
+
# Per-user configuration directory and the JSON file stored inside it.
CONFIG_DIR = Path.home().joinpath(".mlate")
CONFIG_FILE = CONFIG_DIR.joinpath("config.json")
# Environment variables consulted for the API key, highest priority first.
ENV_KEY_PRIORITY = [
    "MLATE_API_KEY",
    "LLM_API_KEY",
    "OPENAI_API_KEY",
    "DEEPSEEK_API_KEY",
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _ensure_dir():
    """Create ``CONFIG_DIR`` if it is missing and, except on Windows, set its mode to 0o700."""
    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    on_windows = sys.platform == "win32"
    if on_windows:
        return
    CONFIG_DIR.chmod(0o700)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load() -> dict:
    """Read the persisted configuration from ``CONFIG_FILE``.

    Returns:
        The parsed JSON object. An empty dict is returned when the file does
        not exist, cannot be read or decoded, is not valid JSON, or does not
        hold a JSON object at top level.
    """
    try:
        # Explicit encoding: the platform default may not be UTF-8, and a
        # hand-edited file can contain non-ASCII values or a BOM.
        with open(CONFIG_FILE, encoding="utf-8-sig") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"配置文件无法读取,已忽略 {CONFIG_FILE}: {exc}",
                       f"Ignoring unreadable config file {CONFIG_FILE}: {exc}")
        return {}
    if not isinstance(data, dict):
        # Callers use .get() on the result, so anything but an object is unusable.
        logger.warning(f"配置文件内容不是 JSON 对象,已忽略: {CONFIG_FILE}",
                       f"Ignoring config file that is not a JSON object: {CONFIG_FILE}")
        return {}
    return data
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def save(cfg: dict):
    """Write *cfg* to ``CONFIG_FILE`` as indented JSON via a temporary file.

    The data is first written to a sibling ``.tmp`` file (mode 0o600 outside
    Windows) which then replaces the real config file, so a failed write does
    not leave a truncated ``config.json`` behind.

    Args:
        cfg: JSON-serialisable configuration mapping.
    """
    _ensure_dir()
    tmp = CONFIG_FILE.with_suffix(".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=2)
    if sys.platform != "win32":
        tmp.chmod(0o600)
    # replace() overwrites an existing target on every platform, whereas
    # rename() raises FileExistsError on Windows once config.json exists.
    tmp.replace(CONFIG_FILE)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def mask_key(key: str) -> str:
    """Return a display form of *key* that hides most of it.

    Keys of 12 characters or fewer are replaced entirely by ``"****"``;
    longer keys keep their first 8 and last 4 characters around an ellipsis.
    """
    too_short = len(key) <= 12
    return "****" if too_short else f"{key[:8]}…{key[-4:]}"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def resolve_api_key() -> str:
    """Return the first non-empty API key found in the environment.

    The variables named in ``ENV_KEY_PRIORITY`` are checked in order; an
    empty string is returned when none of them is set to a non-empty value.
    """
    candidates = (os.environ.get(name) for name in ENV_KEY_PRIORITY)
    return next((value for value in candidates if value), "")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def resolve_base_url(model: str, cli_value: str | None = None) -> str:
|
|
60
|
+
if cli_value:
|
|
61
|
+
return cli_value
|
|
62
|
+
env = os.environ.get("MLATE_API_BASE")
|
|
63
|
+
if env:
|
|
64
|
+
return env
|
|
65
|
+
cfg = load()
|
|
66
|
+
cfg_base = cfg.get("api_base")
|
|
67
|
+
if cfg_base:
|
|
68
|
+
return cfg_base
|
|
69
|
+
m = model.lower()
|
|
70
|
+
if any(k in m for k in ["deepseek"]):
|
|
71
|
+
return "https://api.deepseek.com"
|
|
72
|
+
if any(k in m for k in ["gpt", "o1", "o3"]):
|
|
73
|
+
return "https://api.openai.com/v1"
|
|
74
|
+
return "https://api.deepseek.com"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def resolve_model(cli_value: str | None = None) -> str:
|
|
78
|
+
if cli_value:
|
|
79
|
+
return cli_value
|
|
80
|
+
env = os.environ.get("MLATE_MODEL")
|
|
81
|
+
if env:
|
|
82
|
+
return env
|
|
83
|
+
cfg = load()
|
|
84
|
+
return cfg.get("model", "deepseek-chat")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ── CLI 交互 ──────────────────────────────────────────────────
|
|
88
|
+
def _print_setup_guide():
    """Log instructions for supplying the API key through an environment variable."""
    rule = "=" * 54
    logger.info("")
    logger.info(rule)
    # Bilingual logger calls take the Chinese text first and the English text
    # second; passing them the other way round inverts the selected language.
    logger.info(" API Key 未设置,请通过环境变量配置:", " API Key not set. Please use environment variables:")
    logger.info("")
    logger.info(" 设置环境变量(推荐):", " Set environment variable (Recommended):")
    logger.info(" $env:MLATE_API_KEY = \"sk-xxx\" # Windows PowerShell")
    logger.info(' export MLATE_API_KEY="sk-xxx" # Linux/macOS')
    logger.info(rule)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def cmd_config(args):
    """Handle the ``mlate config`` sub-command.

    Args:
        args: Parsed argparse namespace. ``args.action`` selects the operation
            ("init", "set", "get" or "show") and ``args.args`` holds the
            remaining positional values (``key`` and, for "set", ``value``).

    Bilingual logger calls below pass the Chinese text first and the English
    text second, which is the order the logger's ``(cn, en)`` parameters expect.
    """
    action = args.action
    rest = args.args

    if action == "init":
        _ensure_dir()
        if not CONFIG_FILE.exists():
            save({})
            logger.success(f"已创建 {CONFIG_FILE}", f"Created {CONFIG_FILE}")
        else:
            logger.info(f"已存在: {CONFIG_FILE}", f"Already exists: {CONFIG_FILE}")

    elif action == "set":
        if len(rest) < 2:
            logger.info("用法: mlate config set <key> <value>", "Usage: mlate config set <key> <value>")
            logger.info(" 可用键: api_base, model, lang, output_lang", " Keys: api_base, model, lang, output_lang")
            return
        key, value = rest[0], rest[1]
        if key == "api_key":
            # API keys are never written to disk; refuse instead of storing.
            logger.error("安全策略:api_key 不支持存储在配置文件中。",
                         "Security policy: api_key cannot be stored in config file.")
            logger.info("请改用环境变量 MLATE_API_KEY。",
                        "Please use environment variable MLATE_API_KEY instead.")
            return
        cfg = load()
        cfg[key] = value
        save(cfg)
        logger.success(f"已设置 {key}", f"Successfully set {key}")

    elif action == "get":
        if not rest:
            logger.info("用法: mlate config get <key>", "Usage: mlate config get <key>")
            logger.info(" 可用键: api_base, model, lang, output_lang", " Keys: api_base, model, lang, output_lang")
            return
        key = rest[0]
        if key == "api_key":
            # NOTE(review): this prints the full secret to stdout; consider
            # masking it (mask_key) unless scripts rely on the raw value.
            print(resolve_api_key())
            return
        cfg = load()
        print(cfg.get(key, ""))

    elif action == "show":
        logger.info(f"配置文件: {CONFIG_FILE}", f"Config file: {CONFIG_FILE}")
        logger.info("环境变量: MLATE_API_KEY", "Env variable: MLATE_API_KEY")
        logger.info("")

        cfg = load()
        model = resolve_model()
        base = resolve_base_url(model)
        key = resolve_api_key()

        logger.info("当前生效配置:", "Current effective configuration:")
        logger.info(f" model = {model}")
        logger.info(f" api_base = {base}")
        logger.info(f" log_lang = {cfg.get('lang', 'en')}")
        logger.info(f" output_lang = {cfg.get('output_lang', '中文')}")
        logger.info("")

        # Drop a legacy on-disk key before the file content is echoed below,
        # so the secret is neither kept nor printed.
        if "api_key" in cfg:
            del cfg["api_key"]
            save(cfg)

        if cfg:
            logger.info(f"配置文件内容 ({len(cfg)} 项):", f"Config file content ({len(cfg)} items):")
            for k, v in cfg.items():
                logger.info(f" {k} = {v}")
        else:
            logger.info("配置文件为空", "Config file is empty")

        if not key:
            _print_setup_guide()
|
llm.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""LLM 调用封装 + RateLimiter
|
|
2
|
+
|
|
3
|
+
API Key 与 Base URL 解析委托给 config 模块,实现:
|
|
4
|
+
CLI 参数 > 环境变量 > 配置文件 > 模型名推断
|
|
5
|
+
"""
|
|
6
|
+
import os, time, threading, json
|
|
7
|
+
from openai import OpenAI
|
|
8
|
+
import config
|
|
9
|
+
from logger import logger
|
|
10
|
+
|
|
11
|
+
__all__ = ["RateLimiter", "LLM"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RateLimiter:
    """Space out calls so that at most ``qps`` of them start per second.

    Each ``wait()`` call reserves the next free time slot under a lock and
    then sleeps until that slot. A ``qps`` of zero or less disables limiting.
    """

    def __init__(self, qps: float = 2.0):
        """Store the rate and derive the minimum interval between calls."""
        self.qps = float(qps)
        self.min_interval = 1.0 / self.qps if self.qps > 0 else 0
        self._lock = threading.Lock()
        self._next_time = 0.0  # monotonic timestamp of the next free slot

    def wait(self):
        """Block until this caller's reserved slot is reached."""
        if self.min_interval <= 0:
            return
        with self._lock:
            # monotonic(): the schedule must not jump when the wall clock is adjusted.
            now = time.monotonic()
            slot = max(now, self._next_time)
            self._next_time = slot + self.min_interval
        # Sleep after releasing the lock so other callers can reserve their
        # own slots in the meantime; the resulting spacing is unchanged.
        delay = slot - now
        if delay > 0:
            time.sleep(delay)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LLM:
|
|
32
|
+
def __init__(self, model: str = None, api_base: str = None):
|
|
33
|
+
self.model = config.resolve_model(model)
|
|
34
|
+
self.api_key = config.resolve_api_key()
|
|
35
|
+
self.api_base = config.resolve_base_url(self.model, api_base)
|
|
36
|
+
|
|
37
|
+
if not self.api_key:
|
|
38
|
+
logger.error("API Key not set. Please configure via environment variables:", "API Key 未设置。请通过环境变量配置:")
|
|
39
|
+
logger.info(" $env:MLATE_API_KEY = \"sk-xxx\" # Windows PowerShell")
|
|
40
|
+
logger.info(' export MLATE_API_KEY="sk-xxx" # Linux/macOS')
|
|
41
|
+
raise ValueError("MLATE_API_KEY not set")
|
|
42
|
+
|
|
43
|
+
self.client = OpenAI(base_url=self.api_base, api_key=self.api_key)
|
|
44
|
+
|
|
45
|
+
def chat(self, messages: list, temperature: float = 0.2, retries: int = 2) -> str | None:
|
|
46
|
+
"""Standard chat completion for plain text"""
|
|
47
|
+
last_err = None
|
|
48
|
+
for attempt in range(retries + 1):
|
|
49
|
+
try:
|
|
50
|
+
resp = self.client.chat.completions.create(
|
|
51
|
+
model=self.model,
|
|
52
|
+
messages=messages,
|
|
53
|
+
temperature=temperature,
|
|
54
|
+
)
|
|
55
|
+
return resp.choices[0].message.content
|
|
56
|
+
except Exception as e:
|
|
57
|
+
last_err = e
|
|
58
|
+
if attempt < retries:
|
|
59
|
+
time.sleep(1)
|
|
60
|
+
logger.error(f"LLM 调用失败: {last_err}", f"LLM call failed: {last_err}")
|
|
61
|
+
return None
|
|
62
|
+
|
|
63
|
+
def chat_json(self, messages: list, temperature: float = 0.2, retries: int = 2) -> dict | None:
|
|
64
|
+
last_err = None
|
|
65
|
+
for attempt in range(retries + 1):
|
|
66
|
+
try:
|
|
67
|
+
resp = self.client.chat.completions.create(
|
|
68
|
+
model=self.model,
|
|
69
|
+
messages=messages,
|
|
70
|
+
response_format={"type": "json_object"},
|
|
71
|
+
temperature=temperature,
|
|
72
|
+
)
|
|
73
|
+
return json.loads(resp.choices[0].message.content)
|
|
74
|
+
except Exception as e:
|
|
75
|
+
last_err = e
|
|
76
|
+
if attempt < retries:
|
|
77
|
+
time.sleep(1)
|
|
78
|
+
logger.error(f"LLM 调用失败: {last_err}", f"LLM call failed: {last_err}")
|
|
79
|
+
return None
|
loaders.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from logger import logger
|
|
4
|
+
|
|
5
|
+
class BaseLoader:
    """Load literature exports and locate their title / abstract / keyword columns."""

    # Candidate column names per source, in order of preference.
    MAPPINGS = {
        "wos": {
            "title": ["TI", "Article Title", "Title"],
            "abstract": ["AB", "Abstract"],
            "keywords": ["DE", "Author Keywords", "ID", "Keywords Plus", "Keywords"]
        },
        "scopus": {
            "title": ["Title"],
            "abstract": ["Abstract"],
            "keywords": ["Author Keywords", "Index Keywords"]
        },
        "standard": {
            "title": ["Title", "题目", "标题"],
            "abstract": ["Abstract", "摘要"],
            "keywords": ["Keywords", "Author Keywords", "关键词"]
        }
    }

    # Fallback patterns used when no candidate name matches:
    # (short codes that must equal the whole column name, substrings that may
    # appear anywhere in it). Short codes are NOT matched as substrings because
    # "ti" / "ab" occur inside many unrelated headers ("Citations", "Database").
    _HEURISTICS = {
        "title": (("ti",), ("title",)),
        "abstract": (("ab",), ("abstract", "summary")),
    }

    @staticmethod
    def load(file_path: str, source_type: str = "auto") -> tuple[pd.DataFrame, dict]:
        """Read *file_path* and identify its key columns.

        ``.csv`` is read as comma-separated, ``.txt`` / ``.tsv`` as
        tab-separated (both UTF-8 with optional BOM), ``.xls`` / ``.xlsx`` via
        ``pandas.read_excel``.

        Returns:
            ``(df, mapping)`` as produced by ``identify_columns``.

        Raises:
            ValueError: For an unsupported file extension.
            Exception: Any error raised while reading is logged and re-raised.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        # 1. Load the raw data
        try:
            if ext == ".csv":
                df = pd.read_csv(file_path, encoding="utf-8-sig")
            elif ext in [".xls", ".xlsx"]:
                df = pd.read_excel(file_path)
            elif ext in [".txt", ".tsv"]:
                df = pd.read_csv(file_path, sep="\t", encoding="utf-8-sig")
            else:
                raise ValueError(f"Unsupported file format: {ext}")
        except Exception as e:
            logger.error(f"Failed to load file {file_path}: {e}")
            raise

        # 2. Adaptive column mapping (without renaming)
        return BaseLoader.identify_columns(df, source_type)

    @staticmethod
    def _guess_column(columns, codes, keywords):
        """Return the first column matching the fallback patterns, else ``None``.

        A column matches when its stripped, lower-cased name equals one of
        *codes* or contains one of *keywords*. Labels are converted with
        ``str()`` first so non-string headers do not raise.
        """
        for col in columns:
            name = str(col).strip().lower()
            if name in codes or any(kw in name for kw in keywords):
                return col
        return None

    @staticmethod
    def identify_columns(df: pd.DataFrame, source_type: str = "auto") -> tuple[pd.DataFrame, dict]:
        """Identify the title / abstract / keywords columns of *df*.

        Args:
            df: Raw data frame; duplicated column labels are dropped (first kept).
            source_type: A key of ``MAPPINGS``, or anything else (e.g. "auto")
                to pick the source whose candidate names match the most columns.

        Returns:
            ``(df, mapping)`` where ``mapping`` maps "title", "abstract" and
            "keywords" to the chosen original column names. A field that could
            not be located keeps its default ("Title" / "Abstract" / "Keywords").
        """
        # Deduplicate columns first
        df = df.loc[:, ~df.columns.duplicated()]
        columns = df.columns.tolist()

        if source_type != "auto" and source_type in BaseLoader.MAPPINGS:
            mapping_to_use = BaseLoader.MAPPINGS[source_type]
        else:
            # Score each source by how many of its candidate names are present;
            # ties keep the first source in MAPPINGS order.
            best_score = -1
            best_source = "standard"
            for src, maps in BaseLoader.MAPPINGS.items():
                score = sum(1 for target_list in maps.values()
                            for col in target_list if col in columns)
                if score > best_score:
                    best_score = score
                    best_source = src
            mapping_to_use = BaseLoader.MAPPINGS[best_source]
            logger.info(f"Detected source type: {best_source}")

        # Final mapping of original column names
        res_mapping = {"title": "Title", "abstract": "Abstract", "keywords": "Keywords"}
        found = {"title": False, "abstract": False, "keywords": False}

        # 1. Try predefined mappings (exact names, first candidate wins)
        for internal_name, candidates in mapping_to_use.items():
            for cand in candidates:
                if cand in columns:
                    res_mapping[internal_name] = cand
                    found[internal_name] = True
                    break

        # 2. Heuristic fallback for title and abstract
        for field, (codes, keywords) in BaseLoader._HEURISTICS.items():
            if found[field]:
                continue
            guess = BaseLoader._guess_column(columns, codes, keywords)
            if guess is not None:
                res_mapping[field] = guess
                found[field] = True

        if not found["title"]:
            logger.warning("Could not find Title column automatically.")

        # Log the identified mappings
        logger.section("列名映射确认", "Column Mapping Confirmation")
        logger.info(f" 题目 (Title) : {res_mapping['title']}")
        logger.info(f" 摘要 (Abstract) : {res_mapping['abstract']}")
        logger.info(f" 关键词 (Keywords): {res_mapping['keywords']}")

        return df, res_mapping
|
logger.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
class MLATELogger:
    """Logger supporting English (default), Chinese, and bilingual output.

    Message methods take the Chinese text first and the optional English text
    second; a message given with a single text is emitted unchanged.
    """

    _LANGS = ("cn", "en", "both")

    def __init__(self):
        """Attach to the shared "mlate" logger and pick the initial language."""
        self._logger = logging.getLogger("mlate")
        self._logger.setLevel(logging.INFO)

        if self._logger.handlers:
            # logging.getLogger returns the same object on every call, so a
            # second MLATELogger must reuse the existing handler; adding
            # another one would print every message twice.
            self._handler = self._logger.handlers[0]
            self._formatter = self._handler.formatter
        else:
            # Default output to stdout
            self._handler = logging.StreamHandler(sys.stdout)
            self._formatter = logging.Formatter("%(message)s")
            self._handler.setFormatter(self._formatter)
            self._logger.addHandler(self._handler)

        # Language settings: 'cn', 'en', 'both'
        # Priority: Env var MLATE_LANG > Default (en). An unrecognised value
        # falls back to the default instead of silently enabling bilingual mode.
        env_lang = os.environ.get("MLATE_LANG", "en").strip().lower()
        self.lang = env_lang if env_lang in self._LANGS else "en"

    def set_level(self, level):
        """Set logging level."""
        self._logger.setLevel(level)

    def set_lang(self, lang: str):
        """Set output language: 'cn', 'en', 'both'. Any other value is ignored."""
        if lang in ["cn", "en", "both"]:
            self.lang = lang

    def _format(self, cn: str, en: str = None) -> str:
        """Pick the text to emit for the current language setting."""
        if en is None:
            return cn

        if self.lang == "cn":
            return cn
        elif self.lang == "en":
            return en
        else:
            # Bilingual mode
            return f"{cn} | {en}"

    def info(self, cn: str, en: str = None):
        """Log info message."""
        self._logger.info(self._format(cn, en))

    def error(self, cn: str, en: str = None):
        """Log error message."""
        self._logger.error(self._format(cn, en))

    def warning(self, cn: str, en: str = None):
        """Log warning message."""
        self._logger.warning(self._format(cn, en))

    def success(self, cn: str, en: str = None):
        """Log success message with a checkmark."""
        self._logger.info(f"✓ {self._format(cn, en)}")

    def section(self, cn: str, en: str = None):
        """Log section header."""
        msg = self._format(cn, en)
        self._logger.info(f"\n── {msg} ──")

    def separator(self):
        """Log separator line."""
        self._logger.info("-" * 50)


# Module-level singleton
logger = MLATELogger()
|