079project 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GroupStarter.cjs +647 -0
- package/LICENSE +165 -0
- package/PropagateSignalUseJsWorker.js +92 -0
- package/README.md +102 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/README.md +52 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/README.zh_CN.md +59 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/RedisService.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/cygcrypto-3.dll +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/cyggcc_s-seh-1.dll +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/cygssl-3.dll +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/cygstdc++-6.dll +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/cygwin1.dll +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/cygz.dll +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/dump.rdb +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/install_redis_service.bat +100 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-benchmark.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-check-aof.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-check-rdb.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-cli.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-full.conf +376 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-sentinel.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis-server.exe +0 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/redis.conf +2348 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/sentinel.conf +361 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/start.bat +4 -0
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/uninstall_redis_service.bat +30 -0
- package/boot.py +51 -0
- package/chat_Client.js +29 -0
- package/controller.cjs +118 -0
- package/enhancedForwarder.js +378 -0
- package/forwarder.js +1456 -0
- package/groupmanager.cjs +143 -0
- package/howToStart.txt +8 -0
- package/lemma.csv +210 -0
- package/load.py +35 -0
- package/mainManager.cjs +81 -0
- package/mainStarter.cjs +535 -0
- package/main_Serve.cjs +2745 -0
- package/main_Study.cjs +3230 -0
- package/memeMergeWorker.cjs +55 -0
- package/model_RNN.py +117 -0
- package/note.txt +5 -0
- package/notebook.txt +8 -0
- package/npminstall-debug.log +206 -0
- package/package.json +48 -0
- package/public/chat_straight.html +90 -0
- package/public/index.html +247 -0
- package/public/indexmain.html +136 -0
- package/public/monitor.html +194 -0
- package/robots/wikitext-something.txt +25 -0
- package/runtime.proto +24 -0
- package/runtime_data.json +766294 -0
- package/serializer_seq2seq.h5 +0 -0
- package/start.js +46 -0
- package/tests/test_FIrststep1.txt +1224 -0
- package/tests/test_FIrststep2.txt +2956 -0
- package/tests/test_FIrststep3.txt +1224 -0
- package/tests/test_FIrststep4.txt +1396 -0
- package/tests/test_FIrststep5.txt +2852 -0
- package/tests/test_FIrststep6.txt +1516 -0
- package/tests/test_FirstStep7.txt +1748 -0
- package/tests/test_Firstsetp8.txt +2672 -0
- package/tokenizer.json +1 -0
- package/vocabularySplitter.js +253 -0
- package/wikitext/.gitattributes +27 -0
- package/wikitext/README.md +344 -0
- package/wikitext/describtion.txt +1 -0
package/tokenizer.json
ADDED
@@ -0,0 +1 @@
+{"class_name": "Tokenizer", "config": {"num_words": 5000, "filters": "", "lower": true, "split": " ", "char_level": false, "oov_token": "<UNK>", "document_count": 20, "word_counts": "{\"i\": 2, \"person\": 2, \"a\": 8, \"am\": 2, \"are\": 6, \"student\": 2, \"you\": 2, \"he\": 2, \"is\": 12, \"teacher\": 2, \"my\": 4, \"she\": 2, \"friend\": 2, \"this\": 2, \"book\": 2, \"dog\": 2, \"that\": 2, \"happy\": 2, \"we\": 2, \"they\": 2, \"here\": 2, \"it\": 2, \"raining\": 2, \"cat\": 2, \"the\": 2, \"black\": 2}", "word_docs": "{\"am\": 2, \"person\": 2, \"i\": 2, \"a\": 8, \"you\": 2, \"are\": 6, \"student\": 2, \"he\": 2, \"teacher\": 2, \"is\": 12, \"my\": 4, \"friend\": 2, \"she\": 2, \"this\": 2, \"book\": 2, \"dog\": 2, \"that\": 2, \"happy\": 2, \"we\": 2, \"they\": 2, \"here\": 2, \"raining\": 2, \"it\": 2, \"the\": 2, \"black\": 2, \"cat\": 2}", "index_docs": "{\"8\": 2, \"7\": 2, \"6\": 2, \"3\": 8, \"10\": 2, \"4\": 6, \"9\": 2, \"11\": 2, \"12\": 2, \"2\": 12, \"5\": 4, \"14\": 2, \"13\": 2, \"15\": 2, \"16\": 2, \"17\": 2, \"18\": 2, \"19\": 2, \"20\": 2, \"21\": 2, \"22\": 2, \"24\": 2, \"23\": 2, \"26\": 2, \"27\": 2, \"25\": 2}", "index_word": "{\"1\": \"<UNK>\", \"2\": \"is\", \"3\": \"a\", \"4\": \"are\", \"5\": \"my\", \"6\": \"i\", \"7\": \"person\", \"8\": \"am\", \"9\": \"student\", \"10\": \"you\", \"11\": \"he\", \"12\": \"teacher\", \"13\": \"she\", \"14\": \"friend\", \"15\": \"this\", \"16\": \"book\", \"17\": \"dog\", \"18\": \"that\", \"19\": \"happy\", \"20\": \"we\", \"21\": \"they\", \"22\": \"here\", \"23\": \"it\", \"24\": \"raining\", \"25\": \"cat\", \"26\": \"the\", \"27\": \"black\"}", "word_index": "{\"<UNK>\": 1, \"is\": 2, \"a\": 3, \"are\": 4, \"my\": 5, \"i\": 6, \"person\": 7, \"am\": 8, \"student\": 9, \"you\": 10, \"he\": 11, \"teacher\": 12, \"she\": 13, \"friend\": 14, \"this\": 15, \"book\": 16, \"dog\": 17, \"that\": 18, \"happy\": 19, \"we\": 20, \"they\": 21, \"here\": 22, \"it\": 23, \"raining\": 24, \"cat\": 25, \"the\": 26, \"black\": 27}"}}
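
Note: the tokenizer.json above is a serialized Keras-style `Tokenizer` config in which `word_index` and the other maps are stored as JSON strings inside the config object. The snippet below is a minimal illustrative sketch (not part of the package) of how that `word_index` could be used to turn text into token ids; it ignores `num_words` and `filters` for brevity.

```js
// Illustrative only: decode the Keras-style tokenizer config and map text to ids.
const fs = require('fs');

const { config } = JSON.parse(fs.readFileSync('tokenizer.json', 'utf8'));
const wordIndex = JSON.parse(config.word_index); // the maps are stored as JSON strings
const oovId = wordIndex[config.oov_token];       // "<UNK>" -> 1

function textToSequence(text) {
  return text
    .toLowerCase()                 // config.lower is true
    .split(config.split)           // config.split is " "
    .filter(Boolean)
    .map(word => wordIndex[word] || oovId); // out-of-vocabulary words map to <UNK>
}

console.log(textToSequence('she is my friend')); // [13, 2, 5, 14]
```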
package/vocabularySplitter.js
ADDED
@@ -0,0 +1,253 @@
+const fs = require('fs');
+const path = require('path');
+const axios = require('axios');
+const readline = require('readline');
+const { createHash } = require('crypto');
+const cluster = require('cluster');
+const os = require('os');
+
+// Configuration
+const VOCAB_FILES = [
+  './vocabulary.csv',
+  './extended_vocab.csv',
+  './domain_vocab.csv'
+];
+const SHARD_SIZE = 100 * 1024; // 100KB
+const OUTPUT_DIR = path.join(__dirname, 'vocabShards');
+const EMBEDDINGS_API = process.env.EMBEDDINGS_API || 'http://localhost:5000/api/embed';
+const DOMAINS = [
+  'general', 'science', 'technology', 'medicine', 'engineering',
+  'art', 'history', 'geography', 'music', 'literature',
+  'business', 'finance', 'sports', 'politics', 'education',
+  'psychology', 'philosophy', 'religion', 'food', 'travel'
+];
+
+// Make sure the output directory exists
+if (!fs.existsSync(OUTPUT_DIR)) {
+  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
+}
+
+async function getWordEmbedding(word) {
+  try {
+    const response = await axios.post(EMBEDDINGS_API, { text: word });
+    return response.data.embedding;
+  } catch (error) {
+    console.error(`Failed to fetch embedding for: ${word}`, error.message);
+    // Fallback: generate a pseudo-embedding from the word's hash
+    const hash = createHash('sha256').update(word).digest('hex');
+    const pseudoEmbedding = Array.from({ length: 32 }, (_, i) =>
+      parseInt(hash.substring(i*2, i*2+2), 16) / 255 - 0.5
+    );
+    return pseudoEmbedding;
+  }
+}
+
+async function loadAllVocabulary() {
+  console.log('Loading all vocabulary files...');
+  const allWords = new Set();
+
+  for (const file of VOCAB_FILES) {
+    const filePath = path.join(__dirname, file);
+    if (!fs.existsSync(filePath)) {
+      console.warn(`Warning: vocabulary file does not exist ${filePath}`);
+      continue;
+    }
+
+    const fileStream = fs.createReadStream(filePath);
+    const rl = readline.createInterface({
+      input: fileStream,
+      crlfDelay: Infinity
+    });
+
+    for await (const line of rl) {
+      const word = line.trim().split(',')[0]; // the first column is assumed to be the word
+      if (word && word.length > 1) {
+        allWords.add(word);
+      }
+    }
+  }
+
+  console.log(`Loaded ${allWords.size} unique entries in total`);
+  return Array.from(allWords);
+}
+
+async function assignWordsToDomains(words) {
+  console.log('Assigning words to domains...');
+  const domainWords = {};
+  DOMAINS.forEach(domain => { domainWords[domain] = []; });
+
+  // If there are too many words, keep it simple and assign by character hash
+  if (words.length > 10000) {
+    console.log('Too many entries, assigning with a simple hash...');
+    words.forEach(word => {
+      const hash = word.charCodeAt(0) % DOMAINS.length;
+      domainWords[DOMAINS[hash]].push(word);
+    });
+  } else {
+    // For smaller vocabularies, try clustering with word embeddings
+    console.log('Clustering semantically with word embeddings...');
+
+    const wordsWithEmbeddings = [];
+    // Report progress roughly every 10% of the words
+    const progressInterval = Math.max(1, Math.floor(words.length / 10));
+
+    for (let i = 0; i < words.length; i++) {
+      const word = words[i];
+      const embedding = await getWordEmbedding(word);
+      wordsWithEmbeddings.push({ word, embedding });
+
+      if ((i + 1) % progressInterval === 0 || i === words.length - 1) {
+        console.log(`Progress: ${i+1}/${words.length} (${((i+1)/words.length*100).toFixed(1)}%)`);
+      }
+    }
+
+    // Simplified k-means: pick centroids, then do a single assignment pass
+    const domainCentroids = await initializeCentroids(wordsWithEmbeddings, DOMAINS.length);
+
+    // Assign each word to the nearest centroid
+    wordsWithEmbeddings.forEach(({ word, embedding }) => {
+      let minDist = Infinity;
+      let bestDomain = DOMAINS[0];
+
+      for (let i = 0; i < domainCentroids.length; i++) {
+        const dist = euclideanDistance(embedding, domainCentroids[i]);
+        if (dist < minDist) {
+          minDist = dist;
+          bestDomain = DOMAINS[i];
+        }
+      }
+
+      domainWords[bestDomain].push(word);
+    });
+  }
+
+  // Report the word count per domain
+  for (const domain of DOMAINS) {
+    console.log(`${domain}: ${domainWords[domain].length} words`);
+  }
+
+  return domainWords;
+}
+
+function euclideanDistance(vec1, vec2) {
+  let sum = 0;
+  for (let i = 0; i < vec1.length; i++) {
+    sum += Math.pow(vec1[i] - vec2[i], 2);
+  }
+  return Math.sqrt(sum);
+}
+
+async function initializeCentroids(wordsWithEmbeddings, k) {
+  // Simple centroid initialization: pick k random points
+  const shuffled = [...wordsWithEmbeddings].sort(() => 0.5 - Math.random());
+  return shuffled.slice(0, k).map(w => w.embedding);
+}
+
+function splitIntoShards(domainWords) {
+  console.log('Splitting words into fixed-size shards...');
+  const shards = [];
+
+  for (const domain of DOMAINS) {
+    const words = domainWords[domain];
+    let currentShard = [];
+    let currentSize = 0;
+    let shardCount = 0;
+
+    words.forEach(word => {
+      // Estimate the bytes this word occupies (rough estimate; UTF-8 uses 1-4 bytes per character)
+      const wordSize = Buffer.from(word).length + 1; // +1 for the newline
+
+      if (currentSize + wordSize > SHARD_SIZE && currentShard.length > 0) {
+        // Save the current shard
+        shards.push({
+          domain,
+          index: shardCount++,
+          words: currentShard,
+          size: currentSize
+        });
+
+        // Start a new shard
+        currentShard = [word];
+        currentSize = wordSize;
+      } else {
+        currentShard.push(word);
+        currentSize += wordSize;
+      }
+    });
+
+    // Don't forget the last shard
+    if (currentShard.length > 0) {
+      shards.push({
+        domain,
+        index: shardCount++,
+        words: currentShard,
+        size: currentSize
+      });
+    }
+  }
+
+  console.log(`Created ${shards.length} shards in total`);
+  return shards;
+}
+
+function saveShards(shards) {
+  console.log('Saving shards to files...');
+
+  shards.forEach((shard, idx) => {
+    const filename = `${shard.domain}_${shard.index}.csv`;
+    const filePath = path.join(OUTPUT_DIR, filename);
+
+    const content = shard.words.join('\n');
+    fs.writeFileSync(filePath, content);
+
+    console.log(`Saved: ${filename} (${shard.words.length} words, ${Math.round(shard.size/1024)} KB)`);
+  });
+
+  // Create the shard index file
+  const shardIndex = shards.map((shard, idx) => ({
+    id: idx,
+    filename: `${shard.domain}_${shard.index}.csv`,
+    domain: shard.domain,
+    wordCount: shard.words.length,
+    sizeKB: Math.round(shard.size/1024)
+  }));
+
+  fs.writeFileSync(
+    path.join(OUTPUT_DIR, 'shard_index.json'),
+    JSON.stringify(shardIndex, null, 2)
+  );
+}
+
+async function main() {
+  try {
+    console.log('Starting vocabulary sharding...');
+    // 1. Load all vocabulary files
+    const allWords = await loadAllVocabulary();
+
+    // 2. Assign the words to different domains
+    const domainWords = await assignWordsToDomains(allWords);
+
+    // 3. Split each domain's words into fixed-size shards
+    const shards = splitIntoShards(domainWords);
+
+    // 4. Save the shards
+    saveShards(shards);
+
+    console.log('Vocabulary sharding complete!');
+  } catch (error) {
+    console.error('Vocabulary sharding failed:', error);
+    process.exit(1);
+  }
+}
+
+// Kick off the main flow
+if (require.main === module) {
+  main();
+}
+
+module.exports = {
+  loadAllVocabulary,
+  assignWordsToDomains,
+  splitIntoShards,
+  saveShards
+};
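
Note: vocabularySplitter.js posts each word to `EMBEDDINGS_API` and expects an `{ embedding }` array back, falling back to a hash-based pseudo-embedding when the call fails. The stub below is a hypothetical sketch of a matching local endpoint, useful only for exercising the script; the request/response shape is inferred from the code above and the vectors are random placeholders, not real embeddings.

```js
// Hypothetical stub matching the shape vocabularySplitter.js assumes:
// POST { text } to /api/embed, respond with { embedding }.
const http = require('http');

const server = http.createServer((req, res) => {
  if (req.method !== 'POST' || req.url !== '/api/embed') {
    res.writeHead(404);
    return res.end();
  }
  let body = '';
  req.on('data', chunk => { body += chunk; });
  req.on('end', () => {
    const { text } = JSON.parse(body || '{}');
    // 32-dimensional random placeholder, same length as the script's hash fallback.
    const embedding = Array.from({ length: 32 }, () => Math.random() - 0.5);
    res.writeHead(200, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({ text, embedding }));
  });
});

server.listen(5000, () => console.log('Stub embeddings API on http://localhost:5000/api/embed'));
```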
package/wikitext/.gitattributes
ADDED
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
package/wikitext/README.md
ADDED
@@ -0,0 +1,344 @@
+---
+annotations_creators:
+- no-annotation
+language_creators:
+- crowdsourced
+language:
+- en
+license:
+- cc-by-sa-3.0
+- gfdl
+multilinguality:
+- monolingual
+size_categories:
+- 1M<n<10M
+source_datasets:
+- original
+task_categories:
+- text-generation
+- fill-mask
+task_ids:
+- language-modeling
+- masked-language-modeling
+paperswithcode_id: wikitext-2
+pretty_name: WikiText
+dataset_info:
+- config_name: wikitext-103-raw-v1
+  features:
+  - name: text
+    dtype: string
+  splits:
+  - name: test
+    num_bytes: 1305088
+    num_examples: 4358
+  - name: train
+    num_bytes: 546500949
+    num_examples: 1801350
+  - name: validation
+    num_bytes: 1159288
+    num_examples: 3760
+  download_size: 315466397
+  dataset_size: 548965325
+- config_name: wikitext-103-v1
+  features:
+  - name: text
+    dtype: string
+  splits:
+  - name: test
+    num_bytes: 1295575
+    num_examples: 4358
+  - name: train
+    num_bytes: 545141915
+    num_examples: 1801350
+  - name: validation
+    num_bytes: 1154751
+    num_examples: 3760
+  download_size: 313093838
+  dataset_size: 547592241
+- config_name: wikitext-2-raw-v1
+  features:
+  - name: text
+    dtype: string
+  splits:
+  - name: test
+    num_bytes: 1305088
+    num_examples: 4358
+  - name: train
+    num_bytes: 11061717
+    num_examples: 36718
+  - name: validation
+    num_bytes: 1159288
+    num_examples: 3760
+  download_size: 7747362
+  dataset_size: 13526093
+- config_name: wikitext-2-v1
+  features:
+  - name: text
+    dtype: string
+  splits:
+  - name: test
+    num_bytes: 1270947
+    num_examples: 4358
+  - name: train
+    num_bytes: 10918118
+    num_examples: 36718
+  - name: validation
+    num_bytes: 1134123
+    num_examples: 3760
+  download_size: 7371282
+  dataset_size: 13323188
+configs:
+- config_name: wikitext-103-raw-v1
+  data_files:
+  - split: test
+    path: wikitext-103-raw-v1/test-*
+  - split: train
+    path: wikitext-103-raw-v1/train-*
+  - split: validation
+    path: wikitext-103-raw-v1/validation-*
+- config_name: wikitext-103-v1
+  data_files:
+  - split: test
+    path: wikitext-103-v1/test-*
+  - split: train
+    path: wikitext-103-v1/train-*
+  - split: validation
+    path: wikitext-103-v1/validation-*
+- config_name: wikitext-2-raw-v1
+  data_files:
+  - split: test
+    path: wikitext-2-raw-v1/test-*
+  - split: train
+    path: wikitext-2-raw-v1/train-*
+  - split: validation
+    path: wikitext-2-raw-v1/validation-*
+- config_name: wikitext-2-v1
+  data_files:
+  - split: test
+    path: wikitext-2-v1/test-*
+  - split: train
+    path: wikitext-2-v1/train-*
+  - split: validation
+    path: wikitext-2-v1/validation-*
+---
+
+# Dataset Card for "wikitext"
+
+## Table of Contents
+- [Dataset Description](#dataset-description)
+  - [Dataset Summary](#dataset-summary)
+  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+  - [Languages](#languages)
+- [Dataset Structure](#dataset-structure)
+  - [Data Instances](#data-instances)
+  - [Data Fields](#data-fields)
+  - [Data Splits](#data-splits)
+- [Dataset Creation](#dataset-creation)
+  - [Curation Rationale](#curation-rationale)
+  - [Source Data](#source-data)
+  - [Annotations](#annotations)
+  - [Personal and Sensitive Information](#personal-and-sensitive-information)
+- [Considerations for Using the Data](#considerations-for-using-the-data)
+  - [Social Impact of Dataset](#social-impact-of-dataset)
+  - [Discussion of Biases](#discussion-of-biases)
+  - [Other Known Limitations](#other-known-limitations)
+- [Additional Information](#additional-information)
+  - [Dataset Curators](#dataset-curators)
+  - [Licensing Information](#licensing-information)
+  - [Citation Information](#citation-information)
+  - [Contributions](#contributions)
+
+## Dataset Description
+
+- **Homepage:** [https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)
+- **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+- **Paper:** [Pointer Sentinel Mixture Models](https://arxiv.org/abs/1609.07843)
+- **Point of Contact:** [Stephen Merity](mailto:smerity@salesforce.com)
+- **Size of downloaded dataset files:** 391.41 MB
+- **Size of the generated dataset:** 1.12 GB
+- **Total amount of disk used:** 1.52 GB
+
+### Dataset Summary
+
+The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified
+Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.
+
+Compared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over
+110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation
+and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models
+that can take advantage of long term dependencies.
+
+Each subset comes in two different variants:
+- Raw (for character level work) contain the raw tokens, before the addition of the <unk> (unknown) tokens.
+- Non-raw (for word level work) contain only the tokens in their vocabulary (wiki.train.tokens, wiki.valid.tokens, and wiki.test.tokens).
+The out-of-vocabulary tokens have been replaced with the <unk> token.
+
+
+### Supported Tasks and Leaderboards
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Languages
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+## Dataset Structure
+
+### Data Instances
+
+#### wikitext-103-raw-v1
+
+- **Size of downloaded dataset files:** 191.98 MB
+- **Size of the generated dataset:** 549.42 MB
+- **Total amount of disk used:** 741.41 MB
+
+An example of 'validation' looks as follows.
+```
+This example was too long and was cropped:
+
+{
+    "text": "\" The gold dollar or gold one @-@ dollar piece was a coin struck as a regular issue by the United States Bureau of the Mint from..."
+}
+```
+
+#### wikitext-103-v1
+
+- **Size of downloaded dataset files:** 190.23 MB
+- **Size of the generated dataset:** 548.05 MB
+- **Total amount of disk used:** 738.27 MB
+
+An example of 'train' looks as follows.
+```
+This example was too long and was cropped:
+
+{
+    "text": "\" Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to..."
+}
+```
+
+#### wikitext-2-raw-v1
+
+- **Size of downloaded dataset files:** 4.72 MB
+- **Size of the generated dataset:** 13.54 MB
+- **Total amount of disk used:** 18.26 MB
+
+An example of 'train' looks as follows.
+```
+This example was too long and was cropped:
+
+{
+    "text": "\" The Sinclair Scientific Programmable was introduced in 1975 , with the same case as the Sinclair Oxford . It was larger than t..."
+}
+```
+
+#### wikitext-2-v1
+
+- **Size of downloaded dataset files:** 4.48 MB
+- **Size of the generated dataset:** 13.34 MB
+- **Total amount of disk used:** 17.82 MB
+
+An example of 'train' looks as follows.
+```
+This example was too long and was cropped:
+
+{
+    "text": "\" Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to..."
+}
+```
+
+### Data Fields
+
+The data fields are the same among all splits.
+
+#### wikitext-103-raw-v1
+- `text`: a `string` feature.
+
+#### wikitext-103-v1
+- `text`: a `string` feature.
+
+#### wikitext-2-raw-v1
+- `text`: a `string` feature.
+
+#### wikitext-2-v1
+- `text`: a `string` feature.
+
+### Data Splits
+
+| name | train |validation|test|
+|-------------------|------:|---------:|---:|
+|wikitext-103-raw-v1|1801350| 3760|4358|
+|wikitext-103-v1 |1801350| 3760|4358|
+|wikitext-2-raw-v1 | 36718| 3760|4358|
+|wikitext-2-v1 | 36718| 3760|4358|
+
+## Dataset Creation
+
+### Curation Rationale
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Source Data
+
+#### Initial Data Collection and Normalization
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+#### Who are the source language producers?
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Annotations
+
+#### Annotation process
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+#### Who are the annotators?
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Personal and Sensitive Information
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+## Considerations for Using the Data
+
+### Social Impact of Dataset
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Discussion of Biases
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Other Known Limitations
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+## Additional Information
+
+### Dataset Curators
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Licensing Information
+
+The dataset is available under the [Creative Commons Attribution-ShareAlike License (CC BY-SA 4.0)](https://creativecommons.org/licenses/by-sa/4.0/).
+
+### Citation Information
+
+```
+@misc{merity2016pointer,
+      title={Pointer Sentinel Mixture Models},
+      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
+      year={2016},
+      eprint={1609.07843},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+
+### Contributions
+
+Thanks to [@thomwolf](https://github.com/thomwolf), [@lewtun](https://github.com/lewtun), [@patrickvonplaten](https://github.com/patrickvonplaten), [@mariamabarham](https://github.com/mariamabarham) for adding this dataset.
package/wikitext/describtion.txt
ADDED
@@ -0,0 +1 @@
+Note: this repository comes from https://hf-mirror.com/datasets/Salesforce/wikitext