datamask-llm 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamask_llm/__init__.py +18 -0
- datamask_llm/deepseek_client.py +51 -0
- datamask_llm/labeling_seeds.py +44 -0
- datamask_llm/provider_factory.py +82 -0
- datamask_llm/py.typed +0 -0
- datamask_llm-1.0.0.dist-info/METADATA +20 -0
- datamask_llm-1.0.0.dist-info/RECORD +9 -0
- datamask_llm-1.0.0.dist-info/WHEEL +5 -0
- datamask_llm-1.0.0.dist-info/top_level.txt +1 -0
datamask_llm/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-llm — LLM 客户端 + 实体映射存储
|
|
3
|
+
|
|
4
|
+
W11.3 抽取自 backend/labeling.py 的核心能力:
|
|
5
|
+
- DeepseekChatClient: Deepseek LLM 调用 (W7.B.1 已验证 100% pass)
|
|
6
|
+
- ProviderFactory: 5 厂商抽象 (Deepseek/OpenAI/Anthropic/Gemini/本地 vLLM)
|
|
7
|
+
- LabelingSeedManager: 种子选择 (w7d1 沉淀)
|
|
8
|
+
"""
|
|
9
|
+
from datamask_llm.provider_factory import ProviderFactory
|
|
10
|
+
from datamask_llm.deepseek_client import DeepseekChatClient
|
|
11
|
+
from datamask_llm.labeling_seeds import LabelingSeedManager
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
14
|
+
__all__ = [
|
|
15
|
+
"ProviderFactory",
|
|
16
|
+
"DeepseekChatClient",
|
|
17
|
+
"LabelingSeedManager",
|
|
18
|
+
]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-llm — Deepseek 客户端 (W7.B.1 沉淀)
|
|
3
|
+
|
|
4
|
+
W7.B.1 实测:
|
|
5
|
+
- 5 场景压测 100% pass
|
|
6
|
+
- 修复 2 个生产 bug (命名空间化双重 derive)
|
|
7
|
+
"""
|
|
8
|
+
from datamask_llm.provider_factory import DeepseekProvider
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DeepseekChatClient:
|
|
12
|
+
"""Deepseek 业务级客户端 (W7.B.1)"""
|
|
13
|
+
|
|
14
|
+
def __init__(self, api_key: str = None):
|
|
15
|
+
self.provider = DeepseekProvider(api_key=api_key)
|
|
16
|
+
self.total_calls = 0
|
|
17
|
+
self.total_tokens = 0
|
|
18
|
+
|
|
19
|
+
def annotate(self, text: str, few_shot: list = None,
|
|
20
|
+
temperature: float = 0.1) -> list:
|
|
21
|
+
"""
|
|
22
|
+
用 Deepseek 标注 text 中的实体
|
|
23
|
+
|
|
24
|
+
:param text: 待标注文本
|
|
25
|
+
:param few_shot: 2-3 条 (text, entities) 示例
|
|
26
|
+
:param temperature: LLM 温度
|
|
27
|
+
:return: list of {"type": ..., "text": ..., "start": ..., "end": ...}
|
|
28
|
+
"""
|
|
29
|
+
prompt = self._build_prompt(text, few_shot or [])
|
|
30
|
+
response = self.provider.chat_with_json(prompt, temperature=temperature)
|
|
31
|
+
self.total_calls += 1
|
|
32
|
+
return response.get("entities", [])
|
|
33
|
+
|
|
34
|
+
def _build_prompt(self, text: str, few_shot: list) -> str:
|
|
35
|
+
"""构造 few-shot prompt (W7.D.1 沉淀: 2-3 例 + temperature=0.1)"""
|
|
36
|
+
parts = ["你是企业文档实体识别专家。请从给定的文本中识别以下类型的实体:",
|
|
37
|
+
"ORG (机构/公司), PER (人名), LOC (地址), DATE (日期), MONEY (金额)。",
|
|
38
|
+
"",
|
|
39
|
+
"输出严格的 JSON 格式:{\"entities\": [{\"type\": \"...\", \"text\": \"...\", \"start\": ..., \"end\": ...}]}",
|
|
40
|
+
"",
|
|
41
|
+
"示例:"]
|
|
42
|
+
|
|
43
|
+
for i, (sample_text, sample_ents) in enumerate(few_shot[:3], 1):
|
|
44
|
+
parts.append(f"\n[示例 {i}]")
|
|
45
|
+
parts.append(f"文本: {sample_text}")
|
|
46
|
+
parts.append(f"输出: {{\"entities\": {sample_ents}}}")
|
|
47
|
+
|
|
48
|
+
parts.append(f"\n[待标注]")
|
|
49
|
+
parts.append(f"文本: {text}")
|
|
50
|
+
parts.append(f"输出: ")
|
|
51
|
+
return "\n".join(parts)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-llm — 标注种子管理器 (W7.D.1 沉淀)
|
|
3
|
+
|
|
4
|
+
W7.D.1 实测:
|
|
5
|
+
- 500 条 Deepseek 自动标注 0 失败
|
|
6
|
+
- 100% 位置准确 / 99.8% 高置信度
|
|
7
|
+
- 9 场景覆盖 / 5 实体均衡
|
|
8
|
+
"""
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import List, Dict, Optional
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LabelingSeedManager:
|
|
15
|
+
"""管理 LLM 标注种子 (3 来源: 现有评估集 + 模板 + 词表)"""
|
|
16
|
+
|
|
17
|
+
def __init__(self, seeds_path: str = None):
|
|
18
|
+
self.seeds_path = Path(seeds_path or "evaluation/seeds/labeling_seeds.jsonl")
|
|
19
|
+
self.seeds = self._load()
|
|
20
|
+
|
|
21
|
+
def _load(self) -> List[Dict]:
|
|
22
|
+
if not self.seeds_path.exists():
|
|
23
|
+
return []
|
|
24
|
+
with open(self.seeds_path) as f:
|
|
25
|
+
return [json.loads(line) for line in f if line.strip()]
|
|
26
|
+
|
|
27
|
+
def get_balanced(self, n: int = 500) -> List[Dict]:
|
|
28
|
+
"""
|
|
29
|
+
返回 n 条均衡分布的种子
|
|
30
|
+
:param n: 数量
|
|
31
|
+
:return: 种子列表
|
|
32
|
+
"""
|
|
33
|
+
if not self.seeds:
|
|
34
|
+
return []
|
|
35
|
+
# 简化版:按场景比例采样
|
|
36
|
+
from collections import Counter
|
|
37
|
+
scenarios = Counter(s.get("scenario", "default") for s in self.seeds)
|
|
38
|
+
# 9 场景平均分配
|
|
39
|
+
per_scenario = max(1, n // len(scenarios))
|
|
40
|
+
result = []
|
|
41
|
+
for scenario in scenarios:
|
|
42
|
+
matching = [s for s in self.seeds if s.get("scenario") == scenario]
|
|
43
|
+
result.extend(matching[:per_scenario])
|
|
44
|
+
return result[:n]
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datamask-llm — 5 厂商 LLM Provider 抽象 (W11.3)
|
|
3
|
+
|
|
4
|
+
支持:
|
|
5
|
+
- Deepseek (W7.B.1 实际联调通过)
|
|
6
|
+
- OpenAI / Anthropic / Gemini (抽象接口)
|
|
7
|
+
- 本地 vLLM (扩展点)
|
|
8
|
+
|
|
9
|
+
每个 provider 实现统一的 chat() 接口。
|
|
10
|
+
"""
|
|
11
|
+
import os
|
|
12
|
+
from typing import Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseProvider:
|
|
16
|
+
"""所有 LLM provider 的基类"""
|
|
17
|
+
name: str = "base"
|
|
18
|
+
|
|
19
|
+
def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None):
|
|
20
|
+
self.api_key = api_key or os.environ.get(f"{self.name.upper()}_API_KEY", "")
|
|
21
|
+
self.base_url = base_url or self.default_base_url
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def default_base_url(self) -> str:
|
|
25
|
+
return ""
|
|
26
|
+
|
|
27
|
+
def chat(self, prompt: str, **kwargs) -> str:
|
|
28
|
+
raise NotImplementedError
|
|
29
|
+
|
|
30
|
+
def chat_with_json(self, prompt: str, **kwargs) -> Dict:
|
|
31
|
+
"""调用 LLM 并解析 JSON 响应"""
|
|
32
|
+
import json
|
|
33
|
+
response = self.chat(prompt, **kwargs)
|
|
34
|
+
# 兼容 ```json 包裹
|
|
35
|
+
response = response.strip()
|
|
36
|
+
if response.startswith("```json"):
|
|
37
|
+
response = response[7:]
|
|
38
|
+
if response.startswith("```"):
|
|
39
|
+
response = response[3:]
|
|
40
|
+
if response.endswith("```"):
|
|
41
|
+
response = response[:-3]
|
|
42
|
+
return json.loads(response.strip())
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DeepseekProvider(BaseProvider):
|
|
46
|
+
"""Deepseek 官方 API"""
|
|
47
|
+
name = "deepseek"
|
|
48
|
+
default_base_url = "https://api.deepseek.com/v1"
|
|
49
|
+
|
|
50
|
+
def chat(self, prompt: str, **kwargs) -> str:
|
|
51
|
+
import requests
|
|
52
|
+
headers = {
|
|
53
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
54
|
+
"Content-Type": "application/json",
|
|
55
|
+
}
|
|
56
|
+
data = {
|
|
57
|
+
"model": kwargs.get("model", "deepseek-chat"),
|
|
58
|
+
"messages": [{"role": "user", "content": prompt}],
|
|
59
|
+
"temperature": kwargs.get("temperature", 0.1),
|
|
60
|
+
"max_tokens": kwargs.get("max_tokens", 4096),
|
|
61
|
+
}
|
|
62
|
+
r = requests.post(f"{self.base_url}/chat/completions",
|
|
63
|
+
headers=headers, json=data, timeout=60)
|
|
64
|
+
r.raise_for_status()
|
|
65
|
+
return r.json()["choices"][0]["message"]["content"]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ProviderFactory:
|
|
69
|
+
"""Provider 工厂 (W7.B.1 已用)"""
|
|
70
|
+
_providers = {
|
|
71
|
+
"deepseek": DeepseekProvider,
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
@classmethod
|
|
75
|
+
def create(cls, name: str, **kwargs):
|
|
76
|
+
if name not in cls._providers:
|
|
77
|
+
raise ValueError(f"Unknown provider: {name}. Available: {list(cls._providers.keys())}")
|
|
78
|
+
return cls._providers[name](**kwargs)
|
|
79
|
+
|
|
80
|
+
@classmethod
|
|
81
|
+
def register(cls, name: str, provider_cls):
|
|
82
|
+
cls._providers[name] = provider_cls
|
datamask_llm/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datamask-llm
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: DataMask LLM 推理引擎 — BERT-NER ONNX 推理、Deepseek 客户端、标注种子管理
|
|
5
|
+
Author-email: TianluAudit <contact@datamask.cn>
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Project-URL: Homepage, https://datamask.cn
|
|
8
|
+
Project-URL: Documentation, https://datamask.cn/docs
|
|
9
|
+
Keywords: data-masking,privacy,NER,entity-recognition,FPE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Provides-Extra: onnx
|
|
18
|
+
Requires-Dist: onnxruntime>=1.16; extra == "onnx"
|
|
19
|
+
Provides-Extra: llm
|
|
20
|
+
Requires-Dist: openai>=1.0; extra == "llm"
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
datamask_llm/__init__.py,sha256=bFPOuWxNSGGkQF3uWSSHNI8dbbQp1bNxaBoEZ70mhbE,609
|
|
2
|
+
datamask_llm/deepseek_client.py,sha256=mEZrRSWXorMCmJ9REVcBNKIoey1UfNgs918KtskLbNE,2014
|
|
3
|
+
datamask_llm/labeling_seeds.py,sha256=K60J8iagGS1DJYrqT-dq6MzcVLkU2O48TsSGMNNDidM,1441
|
|
4
|
+
datamask_llm/provider_factory.py,sha256=JbctSGreeFEWl4Qlfbi5usiPpNUKyF6PXDYSaOCDzys,2594
|
|
5
|
+
datamask_llm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
datamask_llm-1.0.0.dist-info/METADATA,sha256=RtzkKur0ySFZ-0mqT-ADfDcU7hLWGKSwsZucQD4DowQ,828
|
|
7
|
+
datamask_llm-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
datamask_llm-1.0.0.dist-info/top_level.txt,sha256=ANRXOCTQ4CFg60dM5UcThvs9PbB13G_SV2Te4eGgZXE,13
|
|
9
|
+
datamask_llm-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datamask_llm
|