alayaflow 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/autotable/1.0.0/metadata.json +9 -0
- alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/autotable/1.0.0/metadata.py +17 -0
- {alayaflow-0.1.2/.alaya.ai/alayaflow/workflows/simple_chat → alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/autotable}/1.0.0/requirements.txt +5 -2
- alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/autotable/1.0.0/schemas.py +44 -0
- alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/autotable/1.0.0/utils.py +35 -0
- alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/autotable/1.0.0/workflow.py +267 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/PKG-INFO +1 -1
- {alayaflow-0.1.2 → alayaflow-0.1.3}/examples/autotable_demo.py +26 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/examples/chat_demo.py +8 -1
- {alayaflow-0.1.2 → alayaflow-0.1.3}/pyproject.toml +1 -1
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/__init__.py +1 -1
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/llm_node.py +4 -5
- alayaflow-0.1.3/src/alayaflow/component/retrieve_node.py +77 -0
- alayaflow-0.1.2/.alaya.ai/alayaflow/workflows/autotable/1.0.0/metadata.json +0 -9
- alayaflow-0.1.2/.alaya.ai/alayaflow/workflows/autotable/1.0.0/workflow.py +0 -400
- alayaflow-0.1.2/src/alayaflow/component/retrieve_node.py +0 -11
- {alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/metadata.json +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/metadata.py +0 -0
- {alayaflow-0.1.2/.alaya.ai/alayaflow/workflows/autotable → alayaflow-0.1.3/.alaya.ai/alayaflow/workflows/simple_chat}/1.0.0/requirements.txt +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/schemas.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/workflow.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/.github/workflows/pr-test.yml +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/.gitignore +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/LICENSE +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/README.md +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/pyproject.origin.toml +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/api/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/api/api_singleton.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/clients/alayamem/base_client.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/clients/alayamem/http_client.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/common/config.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/chat_model.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/intent_classifier.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/langflow/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/langflow/intent_classifier.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/memory.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/model/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/model/model_manager.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/model/schemas.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/search_node.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/component/web_search.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/env_manager.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/executor_manager.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/executors/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/executors/base_executor.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/executors/naive_executor.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/executors/uv_executor.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/executors/worker_executor.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/langfuse_tracing.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/execution/workflow_runner.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/utils/singleton.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/runnable/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/runnable/base_runnable_workflow.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/runnable/state_graph_runnable_workflow.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/workflow_info.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/workflow_loader.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/workflow_manager.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/clients/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/clients/conftest.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/clients/test_alayamem.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/component/test_intent_classifier.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/component/test_llm_node.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/execution/test_env_reuse.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/workflow/__init__.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/workflow/conftest.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/tests/workflow/test_workflow_loader.py +0 -0
- {alayaflow-0.1.2 → alayaflow-0.1.3}/uv.lock +0 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from alayaflow.workflow import WorkflowInfo
|
|
4
|
+
|
|
5
|
+
def get_metadata():
    """Describe the autotable workflow for the loader registry.

    Returns a WorkflowInfo pointing at this directory's ``workflow.py`` /
    ``create_graph`` entry point.
    """
    return WorkflowInfo(
        id="autotable",
        name="Auto Table Filler",
        description="基于知识库的智能表格自动填写工作流",
        version="1.0.0",
        tags=["extraction", "table", "rag"],
        entry_file="workflow.py",
        entry_point="create_graph",
        wf_dir=Path(__file__).parent,
    )
|
|
17
|
+
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
# LangGraph 核心依赖
|
|
2
2
|
langgraph>=0.2.0
|
|
3
3
|
|
|
4
|
+
# LangChain Core
|
|
5
|
+
langchain-core>=0.3.0
|
|
6
|
+
|
|
4
7
|
# LangChain Community (用于 ChatOpenAI)
|
|
5
8
|
langchain-community>=0.3.0
|
|
6
9
|
|
|
7
10
|
# OpenAI SDK (DeepSeek API 兼容 OpenAI 格式)
|
|
8
11
|
openai>=1.0.0
|
|
9
12
|
|
|
10
|
-
# Langfuse
|
|
11
|
-
langfuse>=3.0.0,<4.0.0
|
|
13
|
+
# Langfuse (可选,用于追踪)
|
|
14
|
+
langfuse>=3.0.0,<4.0.0
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict, List, Any, Union, TypeAlias, TypedDict, Annotated
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
from .utils import merge_dicts, deep_merge
|
|
7
|
+
|
|
8
|
+
FieldSpec: TypeAlias = Union[str, Dict[str, List[Any]]]
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class GroupTask:
    """One extraction unit: a parent path plus the leaf keys to fill under it."""
    # Parent key chain, e.g. ("个人信息", "联系方式"); root-level tasks use ().
    path: tuple[str, ...]
    # Leaf field names to extract beneath *path*.
    keys: tuple[str, ...]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class WorkflowInitArgs(BaseModel):
    """Arguments required once, at graph construction time."""
    alayamem_url: str = Field(..., description="AlayaMem 服务地址")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Input(BaseModel):
    """Per-invocation input: the (possibly nested) table template to fill."""
    fields: List[FieldSpec] = Field(..., description="要填写的表格字段模板(支持嵌套结构)")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class WorkflowContext(BaseModel):
    """Runtime configuration carried through the graph's config channel."""
    collection_name: str = Field(default="file_watcher_collection", description="检索的集合名称")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Output(BaseModel):
    """Final workflow output: the filled table plus debug/diagnostic channels."""
    final_result: Dict[str, Any] = Field(..., description="填写完成的表格数据(嵌套结构)")
    context_by_task: Dict[str, List[str]] = Field(default_factory=dict, description="每个任务检索到的文档片段(调试用)")
    errors: Dict[str, str] = Field(default_factory=dict, description="错误信息")
    tasks: List[GroupTask] = Field(default_factory=list, description="规划的任务列表")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class OverallState(TypedDict):
    """Shared graph state; Annotated reducers merge concurrent branch writes."""
    fields: List[FieldSpec]
    # Nested value tree; deep_merge combines patches from parallel tasks.
    final_result: Annotated[Dict[str, Any], deep_merge]
    # Retrieved snippets per task id (debug channel); shallow-merged.
    context_by_task: Annotated[Dict[str, List[str]], merge_dicts]
    errors: Annotated[Dict[str, str], merge_dicts]
    tasks: List[GroupTask]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TaskState(TypedDict):
    """Per-branch state handed to the extract_task node via Send."""
    task: GroupTask
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Tuple
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def make_patch(path: Tuple[str, ...], kv: Dict[str, str]) -> Dict[str, Any]:
    """Wrap *kv* under the nested key chain given by *path*.

    path=("个人信息","联系方式"), kv={"电话":"..","邮箱":".."} =>
    {"个人信息":{"联系方式":{"电话":"..","邮箱":".."}}}
    """
    wrapped: Dict[str, Any] = dict(kv)
    # Walk the path from the innermost segment outward, nesting as we go.
    for idx in range(len(path) - 1, -1, -1):
        wrapped = {path[idx]: wrapped}
    return wrapped
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def slim_docs(docs: List[str], max_doc_chars: int) -> List[str]:
    """Stringify each doc and truncate to *max_doc_chars*, marking cuts with an ellipsis.

    A None or empty *docs* yields an empty list.
    """
    trimmed: List[str] = []
    for doc in docs or []:
        text = str(doc)
        trimmed.append(text if len(text) <= max_doc_chars else text[:max_doc_chars] + "…")
    return trimmed
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def merge_dicts(a: Dict, b: Dict) -> Dict:
    """Shallow-merge two dicts into a new dict; keys in *b* win on collision."""
    merged = dict(a)
    merged.update(b)
    return merged
|
|
27
|
+
|
|
28
|
+
def deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively merge *b* into a copy of *a*.

    Nested dicts are merged key-by-key; any other value in *b* overwrites.
    None inputs are treated as empty dicts.
    """
    merged: Dict[str, Any] = dict(a or {})
    for key, val in (b or {}).items():
        existing = merged.get(key)
        if isinstance(existing, dict) and isinstance(val, dict):
            merged[key] = deep_merge(existing, val)
        else:
            merged[key] = val
    return merged
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from threading import Semaphore
|
|
5
|
+
|
|
6
|
+
from langgraph.graph import StateGraph, START, END
|
|
7
|
+
from langgraph.types import Send
|
|
8
|
+
from langchain_core.runnables import RunnableConfig
|
|
9
|
+
|
|
10
|
+
from alayaflow.component.llm_node import LLMComponent, ResponseFormat
|
|
11
|
+
from alayaflow.clients.alayamem.http_client import HttpAlayaMemClient
|
|
12
|
+
from alayaflow.component.retrieve_node import RetrieveComponent
|
|
13
|
+
|
|
14
|
+
from .schemas import (
|
|
15
|
+
FieldSpec, GroupTask, WorkflowInitArgs,
|
|
16
|
+
OverallState, TaskState, WorkflowContext, Input, Output
|
|
17
|
+
)
|
|
18
|
+
from .utils import make_patch, slim_docs
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def flatten_leaf_tasks(specs: List[FieldSpec], base_path: Optional[List[str]] = None) -> List[Tuple[Tuple[str, ...], str]]:
    """Flatten a nested field template into [(path_tuple, leaf_key), ...] pairs.

    Strings are leaves at the current path; dicts open a nested level per key.
    Children may be a single value or a list; anything that is neither str nor
    dict is silently ignored.
    """
    prefix = base_path or []
    leaves: List[Tuple[Tuple[str, ...], str]] = []

    for spec in specs or []:
        if isinstance(spec, str):
            leaves.append((tuple(prefix), spec))
        elif isinstance(spec, dict):
            for parent, children in spec.items():
                # Normalize: None -> [], scalar -> [scalar], list stays as-is.
                if children is None:
                    child_list = []
                elif isinstance(children, list):
                    child_list = children
                else:
                    child_list = [children]
                for child in child_list:
                    if isinstance(child, str):
                        leaves.append((tuple(prefix + [parent]), child))
                    elif isinstance(child, dict):
                        leaves.extend(flatten_leaf_tasks([child], prefix + [parent]))
                    # non-str/dict children are skipped

    return leaves
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def create_plan_node():
    """Build the planning node: turn the field template into grouped extraction tasks."""
    def plan_node(state: OverallState, config: RunnableConfig):
        # Group leaf keys by their parent path.
        grouped: Dict[Tuple[str, ...], List[str]] = defaultdict(list)
        for path, key in flatten_leaf_tasks(state["fields"]):
            grouped[path].append(key)

        tasks: List[GroupTask] = []
        for path, keys in grouped.items():
            # Deduplicate while keeping first-seen order.
            deduped = list(dict.fromkeys(keys))
            tasks.append(GroupTask(path=path, keys=tuple(deduped)))

        # Stable ordering: shallow paths first, then lexicographic.
        tasks.sort(key=lambda t: (len(t.path), t.path))
        return {"tasks": tasks}

    return plan_node
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def create_extract_task_node(client: HttpAlayaMemClient):
    """Build the fan-out node that fills one GroupTask's fields from the knowledge base.

    The returned node:
      1. retrieves up to TOP_K snippets for a query built from the task's path + keys,
      2. asks the LLM for a strict-JSON value map covering exactly those keys,
      3. emits a nested ``final_result`` patch (combined upstream by the
         ``deep_merge`` state reducer).
    Any failure degrades to an all-empty patch plus an ``errors`` entry, so one
    branch can never break the whole fan-out.
    """
    TOP_K: int = 5
    MAX_DOC_CHARS: int = 400
    MODEL_ID: str = "deepseek-chat"
    MAX_CONCURRENCY: int = 10

    # Cap concurrent retrieval + LLM calls across all parallel branches.
    limiter = Semaphore(MAX_CONCURRENCY)

    def _build_system_prompt(keys: list[str]) -> str:
        keys_str = ", ".join(keys)

        return f"""
你是一个严谨的“局部字段抽取器”(table patch extractor)。

你的任务是:**只为指定字段抽取值**,严格依据提供的知识片段,不得猜测或编造。

通用规则:
1. 输出必须是严格合法 JSON,不允许包含解释、Markdown、代码块或多余文本。
2. **只允许输出以下字段(不多不少)**:{keys_str}
3. 所有字段值必须是字符串。
4. 找不到 / 不确定 / 空值 / 占位符 → 必须输出空字符串 ""。
5. 字段名可能存在空格或轻微变体(如“姓 名”≈“姓名”),允许智能匹配,但不得扩展到未指定字段。

长文本字段格式规则(必须遵守):
- 当字段内容包含**多个条目、多个时间段或多段经历**时:
- 必须使用序号列表格式。
- **每个条目占一行,条目之间必须使用 "\n" 换行符分隔。**
- 不允许使用分号、顿号、逗号等方式合并多个条目到同一行。
- 示例正确格式:
"1.第一条内容\n2.第二条内容\n3.第三条内容"

表格单元格理解规则(重要):
- 知识片段可能来自表格,每行使用 " | " 分隔单元格。
- "<空>" 表示空单元格,对应值为 ""。
- 字段名后不一定是值:
- 若字段名后是 "<空>" → 值为 ""。
- 若字段名后是另一个字段名 → 继续向后寻找第一个“非字段名 / 非占位符”的单元格作为值。
- 示例:"字段A | 字段B | 值" → 字段A="", 字段B="值"。

占位符识别:
- 若候选值是模板占位符或签字日期类文本
(如“签字: 年 月 日”“学院盖章: 年 月 日”等),必须返回 ""。
""".strip()

    def _build_user_prompt(content_text: str, path: list[str], keys: list[str]) -> str:
        path_str = " / ".join(path) if path else "<root>"
        keys_str = ", ".join(keys)

        # All-empty JSON object the model should fill in.
        json_skeleton = "{\n" + ",\n".join([f' "{k}": ""' for k in keys]) + "\n}"

        return f"""
【本次任务定位】
字段路径(仅用于语义定位,不要输出):{path_str}
需要抽取的字段:{keys_str}

【知识库片段】
{content_text}

【输出要求】
- 只输出一个 JSON 对象
- key 必须严格为:{keys_str}
- 无法确定 / 空值 / 占位符 → 输出 ""

【JSON 输出模板】
{json_skeleton}
""".strip()

    def node(state: TaskState, config: RunnableConfig):
        task = state["task"]
        path = task.path
        keys = list(task.keys)

        task_id = f"{'/'.join(path) or '<root>'}:{','.join(keys)}"

        # Default patch keeps the output structure stable even when extraction fails.
        default_kv = {k: "" for k in keys}
        default_patch = make_patch(path, default_kv)

        try:
            with limiter:
                # collection_name is a runtime parameter carried via the config channel.
                config_dict = config.get("configurable", {}) if isinstance(config, dict) else {}
                collection_name = config_dict.get("collection_name", "file_watcher_collection")

                # 1) Retrieval query: parent path + keys (parent headings improve recall).
                query_parts = list(path) + keys
                query = ";".join([p for p in query_parts if p])

                retrieve_component = RetrieveComponent(client=client)
                docs = retrieve_component(query=query, collection_name=collection_name, limit=TOP_K)
                docs = slim_docs(docs, MAX_DOC_CHARS)

                # No context found: emit the all-empty patch and move on.
                if not docs:
                    return {
                        "context_by_task": {task_id: []},
                        "final_result": default_patch,
                    }

                formatted_context = "\n\n".join(
                    [f"片段 {i+1}: {doc}" for i, doc in enumerate(docs)]
                )

                # 2) Extract all keys in one call (strict JSON object response).
                system_prompt = _build_system_prompt(keys)
                user_prompt = _build_user_prompt(formatted_context, path, keys)

                llm = LLMComponent(
                    model_id=MODEL_ID,
                    system_prompt=system_prompt,
                    prompt=user_prompt,
                    response_format=ResponseFormat.JSON,
                    temperature=0.0,
                )

                msg = llm()
                obj = json.loads(msg.content)

                # Normalize: only the requested keys, always strings, never None.
                extracted = {}
                for k in keys:
                    v = obj.get(k, "")
                    extracted[k] = (str(v).strip() if v is not None else "")

                patch = make_patch(path, extracted)

                return {
                    "context_by_task": {task_id: docs},
                    "final_result": patch,
                }

        except Exception as e:
            # Best-effort branch: record the error instead of propagating it.
            return {
                "context_by_task": {task_id: []},
                "final_result": default_patch,
                "errors": {task_id: f"{type(e).__name__}: {e}"},
            }

    return node
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def create_validate_node():
    """Build the reduce-step validator: report every planned field left blank."""
    def validate_node(state: OverallState):
        result = state.get("final_result", {}) or {}

        def _descend(tree: Dict[str, Any], path: Tuple[str, ...]) -> Dict[str, Any]:
            # Follow *path* through nested dicts; any miss yields an empty scope.
            node = tree
            for part in path:
                if not isinstance(node, dict):
                    return {}
                node = node.get(part, {})
            return node if isinstance(node, dict) else {}

        missing = []
        for task in state["tasks"]:
            scope = _descend(result, task.path)
            for key in task.keys:
                if not str(scope.get(key, "")).strip():
                    missing.append(".".join(task.path + (key,)) if task.path else key)

        if missing:
            return {"errors": {"missing": ";".join(missing)}}
        return {}

    return validate_node
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def map_tasks(state: OverallState):
    """Fan out one Send per planned GroupTask to the extract_task node."""
    sends = []
    for group_task in state["tasks"]:
        sends.append(Send("extract_task", {"task": group_task}))
    return sends
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def create_graph(init_args: WorkflowInitArgs | Dict[str, Any]):
    """Compile the autotable graph: plan → parallel extract_task → validate.

    *init_args* may be a WorkflowInitArgs or a plain dict with the same fields.
    """
    if isinstance(init_args, dict):
        init_args = WorkflowInitArgs(**init_args)
    mem_client = HttpAlayaMemClient(init_args.alayamem_url)

    graph = StateGraph(OverallState, WorkflowContext, input_type=Input, output_type=Output)
    graph.add_node("plan", create_plan_node())
    graph.add_node("extract_task", create_extract_task_node(mem_client))
    graph.add_node("validate", create_validate_node())

    graph.add_edge(START, "plan")
    # plan fans out one extract_task branch per planned group via Send.
    graph.add_conditional_edges("plan", map_tasks, ["extract_task"])
    graph.add_edge("extract_task", "validate")
    graph.add_edge("validate", END)

    return graph.compile()
|
|
267
|
+
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
|
|
3
4
|
from alayaflow.api import Flow
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
load_dotenv()
|
|
4
7
|
|
|
5
8
|
def main():
|
|
6
9
|
# Initialize flow api singleton
|
|
@@ -10,6 +13,23 @@ def main():
|
|
|
10
13
|
"alayahub_url": "http://your-alayahub-url",
|
|
11
14
|
})
|
|
12
15
|
|
|
16
|
+
api_key = os.getenv("DEEPSEEK_API_KEY")
|
|
17
|
+
if not api_key:
|
|
18
|
+
raise ValueError("请设置环境变量 DEEPSEEK_API_KEY")
|
|
19
|
+
|
|
20
|
+
flow.register_models([
|
|
21
|
+
{
|
|
22
|
+
# Local used fields
|
|
23
|
+
"name": "DeepSeek Chat",
|
|
24
|
+
"model_id": "deepseek-chat",
|
|
25
|
+
"provider_name": "DeepSeek",
|
|
26
|
+
# Connection credentials
|
|
27
|
+
"model_name": "deepseek-chat",
|
|
28
|
+
"base_url": "https://api.deepseek.com/v1",
|
|
29
|
+
"api_key": api_key,
|
|
30
|
+
}
|
|
31
|
+
])
|
|
32
|
+
|
|
13
33
|
workflow_id = 'autotable'
|
|
14
34
|
workflow_version = '1.0.0'
|
|
15
35
|
init_args = {
|
|
@@ -55,6 +75,12 @@ def main():
|
|
|
55
75
|
if final_result:
|
|
56
76
|
print("\n最终结果:")
|
|
57
77
|
print(json.dumps(final_result.get("final_result", {}), indent=2, ensure_ascii=False))
|
|
78
|
+
|
|
79
|
+
# 打印错误信息(如果有)
|
|
80
|
+
errors = final_result.get("errors", {})
|
|
81
|
+
if errors:
|
|
82
|
+
print("\n错误信息:")
|
|
83
|
+
print(json.dumps(errors, indent=2, ensure_ascii=False))
|
|
58
84
|
|
|
59
85
|
if __name__ == '__main__':
|
|
60
86
|
main()
|
|
@@ -2,6 +2,8 @@ import os
|
|
|
2
2
|
import sys
|
|
3
3
|
|
|
4
4
|
from alayaflow.api import Flow
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
load_dotenv()
|
|
5
7
|
|
|
6
8
|
|
|
7
9
|
def init_flow():
|
|
@@ -15,6 +17,11 @@ def init_flow():
|
|
|
15
17
|
flow.init({
|
|
16
18
|
"alayahub_url": "http://your-alayahub-url",
|
|
17
19
|
})
|
|
20
|
+
|
|
21
|
+
api_key = os.getenv("DEEPSEEK_API_KEY")
|
|
22
|
+
if not api_key:
|
|
23
|
+
raise ValueError("请设置环境变量 DEEPSEEK_API_KEY")
|
|
24
|
+
|
|
18
25
|
flow.register_models([
|
|
19
26
|
{
|
|
20
27
|
# Local used fields
|
|
@@ -24,7 +31,7 @@ def init_flow():
|
|
|
24
31
|
# Connection credentials
|
|
25
32
|
"model_name": "deepseek-chat",
|
|
26
33
|
"base_url": "https://api.deepseek.com/v1",
|
|
27
|
-
"api_key":
|
|
34
|
+
"api_key": api_key,
|
|
28
35
|
}
|
|
29
36
|
])
|
|
30
37
|
|
|
@@ -18,17 +18,14 @@ class LLMComponent:
|
|
|
18
18
|
def __init__(
|
|
19
19
|
self,
|
|
20
20
|
*,
|
|
21
|
-
# ===== 模型 & prompt =====
|
|
22
21
|
model_id: str,
|
|
23
22
|
system_prompt: str,
|
|
24
23
|
prompt: str,
|
|
25
24
|
|
|
26
|
-
# ===== 采样参数 =====
|
|
27
25
|
temperature: Optional[float] = None,
|
|
28
26
|
top_p: Optional[float] = None,
|
|
29
27
|
max_tokens: Optional[int] = None,
|
|
30
28
|
|
|
31
|
-
# ===== 输出控制 =====
|
|
32
29
|
response_format: ResponseFormat = ResponseFormat.TEXT,
|
|
33
30
|
json_schema: Optional[Dict[str, Any]] = None,
|
|
34
31
|
outputs: Optional[Dict[str, str]] = None,
|
|
@@ -47,6 +44,9 @@ class LLMComponent:
|
|
|
47
44
|
self.json_schema = json_schema
|
|
48
45
|
self.outputs = outputs or {}
|
|
49
46
|
self.retry_json_once = retry_json_once
|
|
47
|
+
|
|
48
|
+
# —— 依赖注入(获取全局单例 ModelManager)——
|
|
49
|
+
self._model_manager = ModelManager()
|
|
50
50
|
|
|
51
51
|
def _get_llm(self) -> Runnable:
|
|
52
52
|
bind_kwargs: Dict[str, Any] = {}
|
|
@@ -58,8 +58,7 @@ class LLMComponent:
|
|
|
58
58
|
if self.max_tokens is not None:
|
|
59
59
|
bind_kwargs["max_tokens"] = self.max_tokens
|
|
60
60
|
|
|
61
|
-
|
|
62
|
-
llm = model_manager.get_model(self.model_id, runtime_config=bind_kwargs)
|
|
61
|
+
llm = self._model_manager.get_model(self.model_id, runtime_config=bind_kwargs)
|
|
63
62
|
|
|
64
63
|
return llm
|
|
65
64
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from urllib import request
|
|
2
|
+
from alayaflow.clients.alayamem.http_client import HttpAlayaMemClient
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RetrieveComponent:
    """Mock retrieval component returning canned snippets keyed on query substrings.

    The real vector-DB call is commented out below; this stand-in simulates
    retrieval for demos and tests.
    """

    def __init__(self, client: HttpAlayaMemClient):
        self.client = client

    def __call__(self, query: str, collection_name: str, limit: int = 3) -> list[str]:
        # Real backend call, kept for reference:
        # result = self.client.vdb_query([query], limit, collection_name)
        # return result.get('documents', [[]])[0] if result.get('documents') else []
        hits: list[str] = []

        # Keyword-triggered canned snippets, checked in a fixed order:
        # basic personal info, contact details, employer/title, education, history.
        keyword_rules = [
            (("姓名",), ["用户姓名是张三。", "张三,男,1989年出生。"]),
            (("性别",), ["张三,男,工程师。"]),
            (("年龄",), ["张三,1989年出生,今年34岁。"]),
            (("电话",), ["联系电话:15643431212。", "手机号为15643431212,可随时联系。"]),
            (("邮箱", "Email"), ["电子邮箱:273230101@qq.com。", "邮箱地址为 27223221@qq.com。"]),
            (("工作单位", "公司"), ["现就职于 Google 中国。", "工作单位为 Google。"]),
            (("职务", "职位"), ["现任高级软件工程师。"]),
            (("学历", "教育"), ["2016年本科毕业于清华大学计算机科学与技术专业。", "2020年博士毕业于北京大学。"]),
            (("学习工作经历", "经历"), [
                "2010.09-2014.07 就读于清华大学计算机科学与技术专业。",
                "2014.07-2016.08 在百度公司担任软件开发工程师。",
                "2016.09 至今在 Google 担任高级软件工程师。",
            ]),
        ]
        for triggers, snippets in keyword_rules:
            if any(word in query for word in triggers):
                hits.extend(snippets)

        # Parent-heading fallbacks: only fire when no keyword matched above.
        if not hits and "个人信息" in query:
            hits.append("张三,男,1989年出生,工程师,现居北京。")
        if not hits and "联系方式" in query:
            hits.append("联系方式:电话 15643431212,邮箱 272040101@qq.com。")

        # Empty when nothing matched (simulates a retrieval miss); else truncate.
        return hits[:limit]
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"id": "autotable",
|
|
3
|
-
"name": "RAG 并发信息抽取工作流",
|
|
4
|
-
"description": "基于 LangGraph Map-Reduce 架构的高性能抽取流程。集成信号量限流(Semaphore)、JSON 结构化校验、文档截断及错误兜底机制。",
|
|
5
|
-
"version": "1.0.0",
|
|
6
|
-
"tags": ["rag", "extraction", "langgraph", "json-mode"],
|
|
7
|
-
"entry_file": "workflow.py",
|
|
8
|
-
"entry_point": "create_graph"
|
|
9
|
-
}
|
|
@@ -1,400 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from typing import Any, Dict, List, Optional, TypedDict, Annotated, Union, TypeAlias, Tuple
|
|
4
|
-
from collections import defaultdict
|
|
5
|
-
from threading import Semaphore
|
|
6
|
-
|
|
7
|
-
from langgraph.graph import StateGraph, START, END
|
|
8
|
-
from langgraph.types import Send
|
|
9
|
-
from langchain_core.runnables import RunnableConfig
|
|
10
|
-
|
|
11
|
-
from alayaflow.component.llm_node import LLMComponent, ResponseFormat
|
|
12
|
-
from alayaflow.clients.alayamem.http_client import HttpAlayaMemClient
|
|
13
|
-
from alayaflow.component.retrieve_node import RetrieveComponent
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
FieldSpec: TypeAlias = Union[str, Dict[str, List["FieldSpec"]]] # 递归:dict -> list[FieldSpec]
|
|
17
|
-
|
|
18
|
-
def merge_dicts(a: Dict, b: Dict) -> Dict:
|
|
19
|
-
return {**a, **b}
|
|
20
|
-
|
|
21
|
-
def deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> Dict[str, Any]:
|
|
22
|
-
out = dict(a or {})
|
|
23
|
-
for k, v in (b or {}).items():
|
|
24
|
-
if k in out and isinstance(out[k], dict) and isinstance(v, dict):
|
|
25
|
-
out[k] = deep_merge(out[k], v)
|
|
26
|
-
else:
|
|
27
|
-
out[k] = v
|
|
28
|
-
return out
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@dataclass(frozen=True)
|
|
32
|
-
class GroupTask:
|
|
33
|
-
path: Tuple[str, ...] # 父路径,如 ("个人信息","联系方式");根为 ()
|
|
34
|
-
keys: Tuple[str, ...] # 该路径下需要抽取的叶子字段名
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class OverallState(TypedDict):
|
|
38
|
-
fields: List[FieldSpec] # 输入模板(递归)
|
|
39
|
-
tasks: List[GroupTask] # 规划出的任务列表
|
|
40
|
-
|
|
41
|
-
# 调试信息:每个任务的检索片段
|
|
42
|
-
context_by_task: Annotated[Dict[str, List[str]], merge_dicts]
|
|
43
|
-
|
|
44
|
-
# 最终值树:通过 deep_merge reducer 并发合并 patch
|
|
45
|
-
final_result: Annotated[Dict[str, Any], deep_merge]
|
|
46
|
-
|
|
47
|
-
errors: Annotated[Dict[str, str], merge_dicts]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class TaskState(TypedDict):
|
|
51
|
-
task: GroupTask
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def _as_list(x: Any) -> List[Any]:
|
|
56
|
-
if x is None:
|
|
57
|
-
return []
|
|
58
|
-
if isinstance(x, list):
|
|
59
|
-
return x
|
|
60
|
-
return [x]
|
|
61
|
-
|
|
62
|
-
def flatten_leaf_tasks(specs: List[FieldSpec], base_path: Optional[List[str]] = None) -> List[Tuple[Tuple[str, ...], str]]:
|
|
63
|
-
"""
|
|
64
|
-
返回:[(path_tuple, leaf_key), ...]
|
|
65
|
-
"""
|
|
66
|
-
base_path = base_path or []
|
|
67
|
-
out: List[Tuple[Tuple[str, ...], str]] = []
|
|
68
|
-
|
|
69
|
-
for item in specs or []:
|
|
70
|
-
if isinstance(item, str):
|
|
71
|
-
out.append((tuple(base_path), item))
|
|
72
|
-
continue
|
|
73
|
-
|
|
74
|
-
if isinstance(item, dict):
|
|
75
|
-
for parent, children in item.items():
|
|
76
|
-
for child in _as_list(children):
|
|
77
|
-
if isinstance(child, str):
|
|
78
|
-
out.append((tuple(base_path + [parent]), child))
|
|
79
|
-
elif isinstance(child, dict):
|
|
80
|
-
out.extend(flatten_leaf_tasks([child], base_path + [parent]))
|
|
81
|
-
else:
|
|
82
|
-
pass
|
|
83
|
-
continue
|
|
84
|
-
|
|
85
|
-
return out
|
|
86
|
-
|
|
87
|
-
def plan_node(state: OverallState, config: RunnableConfig):
|
|
88
|
-
leaf = flatten_leaf_tasks(state["fields"])
|
|
89
|
-
grouped: Dict[Tuple[str, ...], List[str]] = defaultdict(list)
|
|
90
|
-
for path, key in leaf:
|
|
91
|
-
grouped[path].append(key)
|
|
92
|
-
|
|
93
|
-
tasks: List[GroupTask] = []
|
|
94
|
-
for path, keys in grouped.items():
|
|
95
|
-
# 去重保持顺序
|
|
96
|
-
seen = set()
|
|
97
|
-
uniq = []
|
|
98
|
-
for k in keys:
|
|
99
|
-
if k not in seen:
|
|
100
|
-
seen.add(k)
|
|
101
|
-
uniq.append(k)
|
|
102
|
-
tasks.append(GroupTask(path=path, keys=tuple(uniq)))
|
|
103
|
-
|
|
104
|
-
# 可选:让任务顺序稳定(不影响并发结果,只影响日志观感)
|
|
105
|
-
tasks.sort(key=lambda t: (len(t.path), t.path))
|
|
106
|
-
return {"tasks": tasks}
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def map_tasks(state: OverallState):
|
|
110
|
-
return [Send("extract_task", {"task": t}) for t in state["tasks"]]
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def make_patch(path: Tuple[str, ...], kv: Dict[str, str]) -> Dict[str, Any]:
|
|
115
|
-
"""
|
|
116
|
-
path=("个人信息","联系方式"), kv={"电话":"..","邮箱":".."} =>
|
|
117
|
-
{"个人信息":{"联系方式":{"电话":"..","邮箱":".."}}}
|
|
118
|
-
"""
|
|
119
|
-
node: Dict[str, Any] = dict(kv)
|
|
120
|
-
for p in reversed(path):
|
|
121
|
-
node = {p: node}
|
|
122
|
-
return node
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
def build_system_prompt(keys: list[str]) -> str:
|
|
127
|
-
keys_str = ", ".join(keys)
|
|
128
|
-
|
|
129
|
-
return f"""
|
|
130
|
-
你是一个严谨的“局部字段抽取器”(table patch extractor)。
|
|
131
|
-
|
|
132
|
-
你的任务是:**只为指定字段抽取值**,严格依据提供的知识片段,不得猜测或编造。
|
|
133
|
-
|
|
134
|
-
通用规则:
|
|
135
|
-
1. 输出必须是严格合法 JSON,不允许包含解释、Markdown、代码块或多余文本。
|
|
136
|
-
2. **只允许输出以下字段(不多不少)**:{keys_str}
|
|
137
|
-
3. 所有字段值必须是字符串。
|
|
138
|
-
4. 找不到 / 不确定 / 空值 / 占位符 → 必须输出空字符串 ""。
|
|
139
|
-
5. 字段名可能存在空格或轻微变体(如“姓 名”≈“姓名”),允许智能匹配,但不得扩展到未指定字段。
|
|
140
|
-
|
|
141
|
-
长文本字段格式规则(必须遵守):
|
|
142
|
-
- 当字段内容包含**多个条目、多个时间段或多段经历**时:
|
|
143
|
-
- 必须使用序号列表格式。
|
|
144
|
-
- **每个条目占一行,条目之间必须使用 "\n" 换行符分隔。**
|
|
145
|
-
- 不允许使用分号、顿号、逗号等方式合并多个条目到同一行。
|
|
146
|
-
- 示例正确格式:
|
|
147
|
-
"1.第一条内容\n2.第二条内容\n3.第三条内容"
|
|
148
|
-
|
|
149
|
-
表格单元格理解规则(重要):
|
|
150
|
-
- 知识片段可能来自表格,每行使用 " | " 分隔单元格。
|
|
151
|
-
- "<空>" 表示空单元格,对应值为 ""。
|
|
152
|
-
- 字段名后不一定是值:
|
|
153
|
-
- 若字段名后是 "<空>" → 值为 ""。
|
|
154
|
-
- 若字段名后是另一个字段名 → 继续向后寻找第一个“非字段名 / 非占位符”的单元格作为值。
|
|
155
|
-
- 示例:"字段A | 字段B | 值" → 字段A="", 字段B="值"。
|
|
156
|
-
|
|
157
|
-
占位符识别:
|
|
158
|
-
- 若候选值是模板占位符或签字日期类文本
|
|
159
|
-
(如“签字: 年 月 日”“学院盖章: 年 月 日”等),必须返回 ""。
|
|
160
|
-
""".strip()
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def build_user_prompt(
    content_text: str,
    path: list[str],  # NOTE(review): callers appear to pass tuples too — sequence of path parts
    keys: list[str],
) -> str:
    """Build the per-task user prompt for the extraction LLM call.

    Combines the task's field path (semantic locator only), the target
    keys, the retrieved knowledge snippets, and a JSON skeleton the model
    must fill in.

    Args:
        content_text: Pre-formatted retrieved snippets.
        path: Hierarchical field path; rendered as "a / b", or "<root>"
            when empty.
        keys: Field names to extract.

    Returns:
        The user prompt string, stripped of surrounding whitespace.
    """
    path_str = " / ".join(path) if path else "<root>"
    keys_str = ", ".join(keys)

    # Skeleton of the expected JSON object: every key preset to "".
    json_skeleton = "{\n" + ",\n".join([f' "{k}": ""' for k in keys]) + "\n}"

    return f"""
【本次任务定位】
字段路径(仅用于语义定位,不要输出):{path_str}
需要抽取的字段:{keys_str}

【知识库片段】
{content_text}

【输出要求】
- 只输出一个 JSON 对象
- key 必须严格为:{keys_str}
- 无法确定 / 空值 / 占位符 → 输出 ""

【JSON 输出模板】
{json_skeleton}
""".strip()
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
def create_extract_task_node(
    client: HttpAlayaMemClient,
    *,
    max_concurrency: int = 10,
    top_k: int = 5,
    max_doc_chars: int = 400,
):
    """Factory: build the per-task extraction node for the graph.

    The returned ``node`` retrieves knowledge snippets for one GroupTask,
    asks the LLM to extract the task's keys as strict JSON, and returns a
    nested patch for ``final_result``. Any failure degrades to an
    all-empty patch plus a per-task error instead of aborting the graph.

    Args:
        client: AlayaMem client used for vector retrieval.
        max_concurrency: Max number of tasks allowed in the retrieve+LLM
            section at once.
        top_k: Number of documents to retrieve per task.
        max_doc_chars: Per-document truncation limit for prompt size.

    Returns:
        A callable ``node(state, config) -> dict`` graph node.
    """
    limiter = Semaphore(max_concurrency)
    # Depends only on the fixed client — build once here rather than on
    # every task invocation.
    retrieve_component = RetrieveComponent(client=client)

    def slim_docs(docs: List[str]) -> List[str]:
        """Truncate each retrieved doc to max_doc_chars to cap prompt size."""
        out = []
        for d in docs or []:
            s = str(d)
            if len(s) > max_doc_chars:
                s = s[:max_doc_chars] + "…"
            out.append(s)
        return out

    def node(state: TaskState, config: RunnableConfig):
        task = state["task"]
        path = task.path
        keys = list(task.keys)

        task_id = f"{'/'.join(path) or '<root>'}:{','.join(keys)}"

        # Default patch: keeps the output structure stable (missing -> "").
        default_kv = {k: "" for k in keys}
        default_patch = make_patch(path, default_kv)

        try:
            with limiter:
                # collection_name is a runtime parameter from the config.
                config_dict = config.get("configurable", {}) if isinstance(config, dict) else {}
                collection_name = config_dict.get("collection_name", "file_watcher_collection")

                # 1) Retrieval query: path ancestors + keys. The deeper the
                # path, the more the parent headings improve recall.
                query_parts = list(path) + keys
                query = ";".join([p for p in query_parts if p])

                docs = retrieve_component(query=query, collection_name=collection_name, limit=top_k)
                docs = slim_docs(docs)

                # No docs: return the empty default patch directly.
                if not docs:
                    return {
                        "context_by_task": {task_id: []},
                        "final_result": default_patch,
                    }

                formatted_context = "\n\n".join(
                    [f"片段 {i+1}: {doc}" for i, doc in enumerate(docs)]
                )

                # 2) Extract all keys in one LLM call (strict JSON object).
                system_prompt = build_system_prompt(keys)
                user_prompt = build_user_prompt(formatted_context, path, keys)

                llm = LLMComponent(
                    model_name="deepseek-chat",
                    system_prompt=system_prompt,
                    prompt=user_prompt,
                    response_format=ResponseFormat.JSON,
                    temperature=0.0,
                )

                msg = llm()
                obj = json.loads(msg.content)

                # Normalize: only the requested keys, always strings.
                extracted = {}
                for k in keys:
                    v = obj.get(k, "")
                    extracted[k] = (str(v).strip() if v is not None else "")

                patch = make_patch(path, extracted)

                return {
                    "context_by_task": {task_id: docs},
                    "final_result": patch,
                }

        except Exception as e:
            # Best-effort per task: surface the error, keep the structure.
            return {
                "context_by_task": {task_id: []},
                "final_result": default_patch,
                "errors": {task_id: f"{type(e).__name__}: {e}"},
            }

    return node
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
def validate_node(state: OverallState, config: RunnableConfig):
    """Check that every planned field received a non-empty value.

    Walks ``final_result`` along each task's path and reports any field
    whose value is missing or blank under ``errors["__missing__"]``.
    """
    result = state.get("final_result", {}) or {}

    def descend(tree: Dict[str, Any], path: Tuple[str, ...]) -> Dict[str, Any]:
        # Walk down the nested dict; anything non-dict on the way means
        # the scope does not exist yet -> empty scope.
        node = tree
        for part in path:
            if not isinstance(node, dict):
                return {}
            node = node.get(part, {})
        if isinstance(node, dict):
            return node
        return {}

    unfilled = []
    for task in state["tasks"]:
        scope = descend(result, task.path)
        for key in task.keys:
            value = str(scope.get(key, "")).strip()
            if not value:
                label = ".".join(task.path + (key,)) if task.path else key
                unfilled.append(label)

    if not unfilled:
        return {}
    return {"errors": {"__missing__": ";".join(unfilled)}}
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
# -------------------------
|
|
312
|
-
# Build graph
|
|
313
|
-
# -------------------------
|
|
314
|
-
def create_graph(init_args: Dict[str, Any]):
    """Compile the autotable extraction graph.

    Pipeline: plan -> (dynamic fan-out) extract_task -> validate.

    Args:
        init_args: Must contain "alayamem_url", the AlayaMem service URL.

    Returns:
        The compiled graph application.
    """
    client = HttpAlayaMemClient(init_args["alayamem_url"])
    g = StateGraph(OverallState)

    g.add_node("plan", plan_node)
    g.add_node("extract_task", create_extract_task_node(client, max_concurrency=10, top_k=3))
    g.add_node("validate", validate_node)

    g.add_edge(START, "plan")
    # map_tasks emits one Send per planned task, so extract_task runs
    # once per task in parallel.
    g.add_conditional_edges("plan", map_tasks, ["extract_task"])
    g.add_edge("extract_task", "validate")
    g.add_edge("validate", END)

    return g.compile()
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
if __name__ == "__main__":
    # Demo run against a hard-coded AlayaMem instance.
    app = create_graph({"alayamem_url": "http://10.16.70.46:5555"})

    # "fields" mixes grouped entries ({section: [keys...]}) and bare
    # top-level field names; the remaining state keys start empty and are
    # filled by the graph.
    input_data: OverallState = {
        "fields": [
            {
                "申请人信息": [
                    "姓名",
                    "性别",
                    "出生年月",
                    "民族",
                    "学位",
                    "职称",
                    "是否在站博士后",
                    "电子邮箱",
                    "办公电话",
                    "国别或地区",
                    "申请人类别",
                    "工作单位",
                    "主要研究领域"
                ]
            },
            {
                "依托单位信息": [
                    "名称",
                    "联系人",
                    "电子邮箱",
                    "电话",
                    "网站地址"
                ]
            },
            {
                "合作研究单位信息": [
                    "单位名称"
                ]
            },
            {
                "项目基本信息": [
                    "项目名称",
                    "英文名称",
                    "资助类别",
                    "亚类说明",
                    "附注说明",
                    "申请代码",
                    "研究期限",
                    "研究方向",
                    "申请资助经费",
                    "研究属性",
                    "中文关键词",
                    "英文关键词"
                ]
            },
            "中文摘要",
            "英文摘要"
        ],
        "tasks": [],
        "context_by_task": {},
        "final_result": {},
        "errors": {},
    }

    # Runtime configuration: which vector collection extract_task queries.
    config = {
        "configurable": {
            "collection_name": "file_watcher_collection",
        }
    }
    out = app.invoke(input_data, config=config)
    print("final_result:")
    print(json.dumps(out["final_result"], ensure_ascii=False, indent=2))
    print("\nerrors:")
    print(json.dumps(out["errors"], ensure_ascii=False, indent=2))
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
from urllib import request
|
|
2
|
-
from alayaflow.clients.alayamem.http_client import HttpAlayaMemClient
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class RetrieveComponent:
    """Thin wrapper around the AlayaMem vector store for single-query retrieval."""

    def __init__(self, client: "HttpAlayaMemClient"):
        # Quoted annotation: documents the expected type without requiring
        # it to be importable at class-definition time.
        self.client = client

    def __call__(self, query: str, collection_name: str, limit: int = 3) -> list[str]:
        """Return up to *limit* document strings matching *query*.

        Args:
            query: Natural-language retrieval query.
            collection_name: Target vector collection.
            limit: Maximum number of documents to return.

        Returns:
            The document list for the (single) query, or [] when the
            backend returns no documents.
        """
        result = self.client.vdb_query([query], limit, collection_name)
        # vdb_query returns one document list per input query; we sent one.
        # Single lookup replaces the old get-with-dead-default + ternary.
        docs = result.get("documents")
        return docs[0] if docs else []
|
{alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/metadata.json
RENAMED
|
File without changes
|
{alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/metadata.py
RENAMED
|
File without changes
|
|
File without changes
|
{alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/schemas.py
RENAMED
|
File without changes
|
{alayaflow-0.1.2 → alayaflow-0.1.3}/.alaya.ai/alayaflow/workflows/simple_chat/1.0.0/workflow.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/runnable/base_runnable_workflow.py
RENAMED
|
File without changes
|
{alayaflow-0.1.2 → alayaflow-0.1.3}/src/alayaflow/workflow/runnable/state_graph_runnable_workflow.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|