pycityagent 2.0.0a21__py3-none-any.whl → 2.0.0a24__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective registries. It is provided for informational purposes only.
@@ -1,136 +0,0 @@
- """A simple in-memory embedding implementation."""
-
- import numpy as np
- import hashlib
- import json
-
-
- class SimpleEmbedding:
-     """A simple in-memory embedding implementation.
-
-     Uses a simple bag-of-words model with TF-IDF weighting to generate
-     vector representations of text. All vectors are kept in memory,
-     which makes this suitable for small-scale applications.
-     """
-
-     def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
-         """Initialize the model.
-
-         Args:
-             vector_dim: Dimensionality of the embedding vectors.
-             cache_size: Maximum number of cached embeddings; once exceeded, the oldest entries are evicted.
-         """
-         self.vector_dim = vector_dim
-         self.cache_size = cache_size
-         self._cache: dict[str, np.ndarray] = {}
-         self._vocab: dict[str, int] = {}  # vocabulary
-         self._idf: dict[str, float] = {}  # document-frequency counts used for IDF
-         self._doc_count = 0  # total number of documents seen
-
-     def _text_to_hash(self, text: str) -> str:
-         """Convert the text to a hash value."""
-         return hashlib.md5(text.encode()).hexdigest()
-
-     def _tokenize(self, text: str) -> list[str]:
-         """Simple tokenization."""
-         # Simple whitespace tokenization; a real application could use a more sophisticated tokenizer
-         return text.lower().split()
-
-     def _update_vocab(self, tokens: list[str]):
-         """Update the vocabulary."""
-         for token in set(tokens):  # deduplicate with a set
-             if token not in self._vocab:
-                 self._vocab[token] = len(self._vocab)
-
-     def _update_idf(self, tokens: list[str]):
-         """Update the document-frequency counts used for IDF."""
-         self._doc_count += 1
-         unique_tokens = set(tokens)
-         for token in unique_tokens:
-             self._idf[token] = self._idf.get(token, 0) + 1
-
-     def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
-         """Compute term frequencies (TF)."""
-         tf = {}
-         total_tokens = len(tokens)
-         for token in tokens:
-             tf[token] = tf.get(token, 0) + 1
-         # Normalize
-         for token in tf:
-             tf[token] /= total_tokens
-         return tf
-
-     def _calculate_tfidf(self, tokens: list[str]) -> np.ndarray:
-         """Compute the TF-IDF vector."""
-         vector = np.zeros(self.vector_dim)
-         tf = self._calculate_tf(tokens)
-
-         for token, tf_value in tf.items():
-             if token in self._idf:
-                 idf = np.log(self._doc_count / self._idf[token])
-                 idx = self._vocab[token] % self.vector_dim  # fold the vocabulary index into the fixed dimension (hashing trick)
-                 vector[idx] += tf_value * idf
-
-         # L2 normalization
-         norm = np.linalg.norm(vector)
-         if norm > 0:
-             vector /= norm
-
-         return vector
-
-     async def embed(self, text: str) -> np.ndarray:
-         """Generate a vector representation of the text.
-
-         Args:
-             text: Input text.
-
-         Returns:
-             np.ndarray: Vector representation of the text.
-         """
-         # Check the cache
-         text_hash = self._text_to_hash(text)
-         if text_hash in self._cache:
-             return self._cache[text_hash]
-
-         # Tokenize
-         tokens = self._tokenize(text)
-         if not tokens:
-             return np.zeros(self.vector_dim)
-
-         # Update the vocabulary and IDF statistics
-         self._update_vocab(tokens)
-         self._update_idf(tokens)
-
-         # Compute the vector
-         vector = self._calculate_tfidf(tokens)
-
-         # Update the cache
-         if len(self._cache) >= self.cache_size:
-             # Evict the oldest cache entry
-             oldest_key = next(iter(self._cache))
-             del self._cache[oldest_key]
-         self._cache[text_hash] = vector
-
-         return vector
-
-     def save(self, file_path: str):
-         """Save the model state."""
-         state = {
-             "vector_dim": self.vector_dim,
-             "cache_size": self.cache_size,
-             "vocab": self._vocab,
-             "idf": self._idf,
-             "doc_count": self._doc_count,
-         }
-         with open(file_path, "w") as f:
-             json.dump(state, f)
-
-     def load(self, file_path: str):
-         """Load the model state."""
-         with open(file_path, "r") as f:
-             state = json.load(f)
-         self.vector_dim = state["vector_dim"]
-         self.cache_size = state["cache_size"]
-         self._vocab = state["vocab"]
-         self._idf = state["idf"]
-         self._doc_count = state["doc_count"]
-         self._cache = {}  # clear the cache
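
For context, the removed SimpleEmbedding class could be driven on its own roughly as follows. This is a minimal sketch assuming only the file above; the asyncio driver, the sample sentences, and the simple_embedding.json path are illustrative and not part of the package.

import asyncio
import numpy as np

async def main():
    emb = SimpleEmbedding(vector_dim=128, cache_size=1000)
    # Warm up the vocabulary and document-frequency statistics. Note that the
    # very first document always embeds to the zero vector, since every IDF
    # term is log(1/1) = 0 at that point.
    for doc in ["the quick brown fox", "a lazy dog", "the lazy cat"]:
        await emb.embed(doc)
    v1 = await emb.embed("the quick brown fox jumps")
    v2 = await emb.embed("the quick brown dog runs")
    # Vectors are L2-normalized, so a plain dot product acts as cosine similarity.
    print(float(np.dot(v1, v2)))
    # Repeating a text returns the cached ndarray object itself.
    assert (await emb.embed("the quick brown fox jumps")) is v1
    # save() persists the vocabulary/IDF state (not the cache); load() restores it.
    emb.save("simple_embedding.json")

asyncio.run(main())

The modulo in _calculate_tfidf is a form of feature hashing: vocabulary indices beyond vector_dim wrap around, trading occasional collisions for a fixed-size vector that does not grow with the vocabulary.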