@llmindset/hf-mcp 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/dataset-detail.d.ts +26 -0
- package/dist/dataset-detail.d.ts.map +1 -0
- package/dist/dataset-detail.js +157 -0
- package/dist/dataset-detail.js.map +1 -0
- package/dist/dataset-search.d.ts +62 -0
- package/dist/dataset-search.d.ts.map +1 -0
- package/dist/dataset-search.js +158 -0
- package/dist/dataset-search.js.map +1 -0
- package/dist/duplicate-space.d.ts +75 -0
- package/dist/duplicate-space.d.ts.map +1 -0
- package/dist/duplicate-space.js +189 -0
- package/dist/duplicate-space.js.map +1 -0
- package/dist/error-messages.d.ts +4 -0
- package/dist/error-messages.d.ts.map +1 -0
- package/dist/error-messages.js +30 -0
- package/dist/error-messages.js.map +1 -0
- package/dist/hf-api-call.d.ts +18 -0
- package/dist/hf-api-call.d.ts.map +1 -0
- package/dist/hf-api-call.js +105 -0
- package/dist/hf-api-call.js.map +1 -0
- package/dist/index.d.ts +16 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/model-detail.d.ts +26 -0
- package/dist/model-detail.d.ts.map +1 -0
- package/dist/model-detail.js +224 -0
- package/dist/model-detail.js.map +1 -0
- package/dist/model-search.d.ts +64 -0
- package/dist/model-search.d.ts.map +1 -0
- package/dist/model-search.js +161 -0
- package/dist/model-search.js.map +1 -0
- package/dist/paper-search.d.ts +58 -0
- package/dist/paper-search.d.ts.map +1 -0
- package/dist/paper-search.js +114 -0
- package/dist/paper-search.js.map +1 -0
- package/dist/paper-summary.d.ts +35 -0
- package/dist/paper-summary.d.ts.map +1 -0
- package/dist/paper-summary.js +187 -0
- package/dist/paper-summary.js.map +1 -0
- package/dist/space-files.d.ts +44 -0
- package/dist/space-files.d.ts.map +1 -0
- package/dist/space-files.js +242 -0
- package/dist/space-files.js.map +1 -0
- package/dist/space-info.d.ts +56 -0
- package/dist/space-info.d.ts.map +1 -0
- package/dist/space-info.js +135 -0
- package/dist/space-info.js.map +1 -0
- package/dist/space-search.d.ts +71 -0
- package/dist/space-search.d.ts.map +1 -0
- package/dist/space-search.js +95 -0
- package/dist/space-search.js.map +1 -0
- package/dist/tool-ids.d.ts +23 -0
- package/dist/tool-ids.d.ts.map +1 -0
- package/dist/tool-ids.js +55 -0
- package/dist/tool-ids.js.map +1 -0
- package/dist/user-summary.d.ts +56 -0
- package/dist/user-summary.d.ts.map +1 -0
- package/dist/user-summary.js +271 -0
- package/dist/user-summary.js.map +1 -0
- package/dist/utilities.d.ts +8 -0
- package/dist/utilities.d.ts.map +1 -0
- package/dist/utilities.js +53 -0
- package/dist/utilities.js.map +1 -0
- package/eslint.config.js +43 -0
- package/package.json +47 -0
- package/src/dataset-detail.ts +257 -0
- package/src/dataset-search.ts +237 -0
- package/src/duplicate-space.ts +263 -0
- package/src/error-messages.ts +57 -0
- package/src/hf-api-call.ts +182 -0
- package/src/index.ts +18 -0
- package/src/model-detail.ts +359 -0
- package/src/model-search.ts +231 -0
- package/src/paper-search.ts +188 -0
- package/src/paper-summary.ts +303 -0
- package/src/space-files.ts +325 -0
- package/src/space-info.ts +190 -0
- package/src/space-search.ts +177 -0
- package/src/tool-ids.ts +84 -0
- package/src/user-summary.ts +421 -0
- package/src/utilities.ts +64 -0
- package/test/duplicate-space.spec.ts +41 -0
- package/test/fixtures/paper_result_kazakh.json +854 -0
- package/test/fixtures/space-result.json +263 -0
- package/test/paper-search.spec.ts +57 -0
- package/test/paper-summary.spec.ts +113 -0
- package/test/space-files.spec.ts +232 -0
- package/test/space-search.spec.ts +29 -0
- package/test/user-summary.spec.ts +131 -0
- package/tsconfig.json +31 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,854 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"paper": {
|
|
4
|
+
"id": "2404.04487",
|
|
5
|
+
"authors": [
|
|
6
|
+
{
|
|
7
|
+
"_id": "661508c74476535462a61577",
|
|
8
|
+
"user": {
|
|
9
|
+
"_id": "6426341dad1e3b0e6e90ebd6",
|
|
10
|
+
"avatarUrl": "/avatars/449b0957c458b8aedb6e56b015852bc3.svg",
|
|
11
|
+
"isPro": false,
|
|
12
|
+
"fullname": "Rustem Yeshpanov",
|
|
13
|
+
"user": "yeshpanovrustem",
|
|
14
|
+
"type": "user"
|
|
15
|
+
},
|
|
16
|
+
"name": "Rustem Yeshpanov",
|
|
17
|
+
"status": "claimed_verified",
|
|
18
|
+
"statusLastChangedAt": "2024-11-29T10:08:36.253Z",
|
|
19
|
+
"hidden": false
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"_id": "661508c74476535462a61578",
|
|
23
|
+
"user": {
|
|
24
|
+
"_id": "603f8056076aa73940921525",
|
|
25
|
+
"avatarUrl": "/avatars/6aeffe1021af17ced8480a4c718083f6.svg",
|
|
26
|
+
"isPro": false,
|
|
27
|
+
"fullname": "Pavel Efimov",
|
|
28
|
+
"user": "pefimov",
|
|
29
|
+
"type": "user"
|
|
30
|
+
},
|
|
31
|
+
"name": "Pavel Efimov",
|
|
32
|
+
"status": "extracted_confirmed",
|
|
33
|
+
"statusLastChangedAt": "2024-04-09T09:23:55.492Z",
|
|
34
|
+
"hidden": false
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"_id": "661508c74476535462a61579",
|
|
38
|
+
"user": {
|
|
39
|
+
"_id": "61623a7bb45ca126bca65688",
|
|
40
|
+
"avatarUrl": "/avatars/cdbd04afdb5401d1cbbd390416f3c1e3.svg",
|
|
41
|
+
"isPro": false,
|
|
42
|
+
"fullname": "Leo Boytsov",
|
|
43
|
+
"user": "searchivarius",
|
|
44
|
+
"type": "user"
|
|
45
|
+
},
|
|
46
|
+
"name": "Leonid Boytsov",
|
|
47
|
+
"status": "extracted_confirmed",
|
|
48
|
+
"statusLastChangedAt": "2024-04-09T17:59:21.864Z",
|
|
49
|
+
"hidden": false
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"_id": "661508c74476535462a6157a",
|
|
53
|
+
"user": {
|
|
54
|
+
"_id": "63a2880ce36f2e4d5b2340a3",
|
|
55
|
+
"avatarUrl": "/avatars/753ea60d96bc3109d1a3a40f3ac19256.svg",
|
|
56
|
+
"isPro": false,
|
|
57
|
+
"fullname": "Ardak Shalkarbayuly",
|
|
58
|
+
"user": "ardakshalkar",
|
|
59
|
+
"type": "user"
|
|
60
|
+
},
|
|
61
|
+
"name": "Ardak Shalkarbayuli",
|
|
62
|
+
"status": "extracted_pending",
|
|
63
|
+
"statusLastChangedAt": "2024-04-09T09:22:15.652Z",
|
|
64
|
+
"hidden": false
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"_id": "661508c74476535462a6157b",
|
|
68
|
+
"user": {
|
|
69
|
+
"_id": "6361fd8dda0599dc08caac11",
|
|
70
|
+
"avatarUrl": "/avatars/30c26ccb60282d2567c40749965a9708.svg",
|
|
71
|
+
"isPro": false,
|
|
72
|
+
"fullname": "Pavel Braslavski",
|
|
73
|
+
"user": "pbras",
|
|
74
|
+
"type": "user"
|
|
75
|
+
},
|
|
76
|
+
"name": "Pavel Braslavski",
|
|
77
|
+
"status": "extracted_pending",
|
|
78
|
+
"statusLastChangedAt": "2024-04-09T09:22:15.652Z",
|
|
79
|
+
"hidden": false
|
|
80
|
+
}
|
|
81
|
+
],
|
|
82
|
+
"publishedAt": "2024-04-06T03:40:36.000Z",
|
|
83
|
+
"title": "KazQAD: Kazakh Open-Domain Question Answering Dataset",
|
|
84
|
+
"summary": "We introduce KazQAD -- a Kazakh open-domain question answering (ODQA) dataset\n-- that can be used in both reading comprehension and full ODQA settings, as\nwell as for information retrieval experiments. KazQAD contains just under 6,000\nunique questions with extracted short answers and nearly 12,000 passage-level\nrelevance judgements. We use a combination of machine translation, Wikipedia\nsearch, and in-house manual annotation to ensure annotation efficiency and data\nquality. The questions come from two sources: translated items from the Natural\nQuestions (NQ) dataset (only for training) and the original Kazakh Unified\nNational Testing (UNT) exam (for development and testing). The accompanying\ntext corpus contains more than 800,000 passages from the Kazakh Wikipedia. As a\nsupplementary dataset, we release around 61,000 question-passage-answer triples\nfrom the NQ dataset that have been machine-translated into Kazakh. We develop\nbaseline retrievers and readers that achieve reasonable scores in retrieval\n(NDCG@10 = 0.389 MRR = 0.382), reading comprehension (EM = 38.5 F1 = 54.2), and\nfull ODQA (EM = 17.8 F1 = 28.7) settings. Nevertheless, these results are\nsubstantially lower than state-of-the-art results for English QA collections,\nand we think that there should still be ample room for improvement. We also\nshow that the current OpenAI's ChatGPTv3.5 is not able to answer KazQAD test\nquestions in the closed-book setting with acceptable quality. The dataset is\nfreely available under the Creative Commons licence (CC BY-SA) at\nhttps://github.com/IS2AI/KazQAD.",
|
|
85
|
+
"upvotes": 1,
|
|
86
|
+
"discussionId": "661508c74476535462a6158d",
|
|
87
|
+
"ai_keywords": [
|
|
88
|
+
"open-domain question answering (ODQA)",
|
|
89
|
+
"reading comprehension",
|
|
90
|
+
"full ODQA",
|
|
91
|
+
"information retrieval",
|
|
92
|
+
"machine translation",
|
|
93
|
+
"Wikipedia search",
|
|
94
|
+
"Natural Questions (NQ) dataset",
|
|
95
|
+
"Kazakh Unified National Testing (UNT) exam",
|
|
96
|
+
"NDCG@10",
|
|
97
|
+
"MRR",
|
|
98
|
+
"EM",
|
|
99
|
+
"F1",
|
|
100
|
+
"closed-book setting",
|
|
101
|
+
"ChatGPTv3.5"
|
|
102
|
+
]
|
|
103
|
+
},
|
|
104
|
+
"publishedAt": "2024-04-05T23:40:36.000Z",
|
|
105
|
+
"title": "KazQAD: Kazakh Open-Domain Question Answering Dataset",
|
|
106
|
+
"summary": "We introduce KazQAD -- a Kazakh open-domain question answering (ODQA) dataset\n-- that can be used in both reading comprehension and full ODQA settings, as\nwell as for information retrieval experiments. KazQAD contains just under 6,000\nunique questions with extracted short answers and nearly 12,000 passage-level\nrelevance judgements. We use a combination of machine translation, Wikipedia\nsearch, and in-house manual annotation to ensure annotation efficiency and data\nquality. The questions come from two sources: translated items from the Natural\nQuestions (NQ) dataset (only for training) and the original Kazakh Unified\nNational Testing (UNT) exam (for development and testing). The accompanying\ntext corpus contains more than 800,000 passages from the Kazakh Wikipedia. As a\nsupplementary dataset, we release around 61,000 question-passage-answer triples\nfrom the NQ dataset that have been machine-translated into Kazakh. We develop\nbaseline retrievers and readers that achieve reasonable scores in retrieval\n(NDCG@10 = 0.389 MRR = 0.382), reading comprehension (EM = 38.5 F1 = 54.2), and\nfull ODQA (EM = 17.8 F1 = 28.7) settings. Nevertheless, these results are\nsubstantially lower than state-of-the-art results for English QA collections,\nand we think that there should still be ample room for improvement. We also\nshow that the current OpenAI's ChatGPTv3.5 is not able to answer KazQAD test\nquestions in the closed-book setting with acceptable quality. The dataset is\nfreely available under the Creative Commons licence (CC BY-SA) at\nhttps://github.com/IS2AI/KazQAD.",
|
|
107
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2404.04487.png",
|
|
108
|
+
"numComments": 0,
|
|
109
|
+
"upvoted": false,
|
|
110
|
+
"isAuthorParticipating": false,
|
|
111
|
+
"highlightedTitle": [
|
|
112
|
+
{ "type": "text", "text": "KazQAD: " },
|
|
113
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
114
|
+
{ "type": "text", "text": " Open-Domain Question Answering Dataset" }
|
|
115
|
+
],
|
|
116
|
+
"highlightedSummary": [
|
|
117
|
+
{ "type": "text", "text": "We introduce KazQAD -- a " },
|
|
118
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
119
|
+
{
|
|
120
|
+
"type": "text",
|
|
121
|
+
"text": " open-domain question answering (ODQA) dataset\n-- that can be used in both reading comprehension and full ODQA settings, as\nwell as for information retrieval experiments. KazQAD contains just under 6,000\nunique questions with extracted short answers and nearly 12,000 passage-level\nrelevance judgements. We use a combination of machine translation, Wikipedia\nsearch, and in-house manual annotation to ensure annotation efficiency and data\nquality. The questions come from two sources: translated items from the Natural\nQuestions (NQ) dataset (only for training) and the original Kazakh Unified\nNational Testing (UNT) exam (for development and testing). The accompanying\ntext corpus contains more than 800,000 passages from the Kazakh Wikipedia. As a\nsupplementary dataset, we release around 61,000 question-passage-answer triples\nfrom the NQ dataset that have been machine-translated into Kazakh. We develop\nbaseline retrievers and readers that achieve reasonable scores in retrieval\n(NDCG@10 = 0.389 MRR = 0.382), reading comprehension (EM = 38.5 F1 = 54.2), and\nfull ODQA (EM = 17.8 F1 = 28.7) settings. Nevertheless, these results are\nsubstantially lower than state-of-the-art results for English QA collections,\nand we think that there should still be ample room for improvement. We also\nshow that the current OpenAI's ChatGPTv3.5 is not able to answer KazQAD test\nquestions in the closed-book setting with acceptable quality. The dataset is\nfreely available under the Creative Commons licence (CC BY-SA) at\nhttps://github.com/IS2AI/KazQAD."
|
|
122
|
+
}
|
|
123
|
+
]
|
|
124
|
+
},
|
|
125
|
+
{
|
|
126
|
+
"paper": {
|
|
127
|
+
"id": "2403.19399",
|
|
128
|
+
"authors": [
|
|
129
|
+
{
|
|
130
|
+
"_id": "66198cc11e0c93ea29ffe643",
|
|
131
|
+
"user": {
|
|
132
|
+
"_id": "6426341dad1e3b0e6e90ebd6",
|
|
133
|
+
"avatarUrl": "/avatars/449b0957c458b8aedb6e56b015852bc3.svg",
|
|
134
|
+
"isPro": false,
|
|
135
|
+
"fullname": "Rustem Yeshpanov",
|
|
136
|
+
"user": "yeshpanovrustem",
|
|
137
|
+
"type": "user"
|
|
138
|
+
},
|
|
139
|
+
"name": "Rustem Yeshpanov",
|
|
140
|
+
"status": "claimed_verified",
|
|
141
|
+
"statusLastChangedAt": "2024-11-29T10:08:41.389Z",
|
|
142
|
+
"hidden": false
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
"_id": "66198cc11e0c93ea29ffe644",
|
|
146
|
+
"name": "Alina Polonskaya",
|
|
147
|
+
"hidden": false
|
|
148
|
+
},
|
|
149
|
+
{
|
|
150
|
+
"_id": "66198cc11e0c93ea29ffe645",
|
|
151
|
+
"name": "Huseyin Atakan Varol",
|
|
152
|
+
"hidden": false
|
|
153
|
+
}
|
|
154
|
+
],
|
|
155
|
+
"publishedAt": "2024-03-28T13:19:16.000Z",
|
|
156
|
+
"title": "KazParC: Kazakh Parallel Corpus for Machine Translation",
|
|
157
|
+
"summary": "We introduce KazParC, a parallel corpus designed for machine translation\nacross Kazakh, English, Russian, and Turkish. The first and largest publicly\navailable corpus of its kind, KazParC contains a collection of 371,902 parallel\nsentences covering different domains and developed with the assistance of human\ntranslators. Our research efforts also extend to the development of a neural\nmachine translation model nicknamed Tilmash. Remarkably, the performance of\nTilmash is on par with, and in certain instances, surpasses that of industry\ngiants, such as Google Translate and Yandex Translate, as measured by standard\nevaluation metrics, such as BLEU and chrF. Both KazParC and Tilmash are openly\navailable for download under the Creative Commons Attribution 4.0 International\nLicense (CC BY 4.0) through our GitHub repository.",
|
|
158
|
+
"upvotes": 0,
|
|
159
|
+
"discussionId": "66198cc11e0c93ea29ffe669",
|
|
160
|
+
"ai_keywords": [
|
|
161
|
+
"parallel corpus",
|
|
162
|
+
"machine translation",
|
|
163
|
+
"parallel sentences",
|
|
164
|
+
"machine translation model",
|
|
165
|
+
"BLEU",
|
|
166
|
+
"chrF"
|
|
167
|
+
]
|
|
168
|
+
},
|
|
169
|
+
"publishedAt": "2024-03-28T09:19:16.000Z",
|
|
170
|
+
"title": "KazParC: Kazakh Parallel Corpus for Machine Translation",
|
|
171
|
+
"summary": "We introduce KazParC, a parallel corpus designed for machine translation\nacross Kazakh, English, Russian, and Turkish. The first and largest publicly\navailable corpus of its kind, KazParC contains a collection of 371,902 parallel\nsentences covering different domains and developed with the assistance of human\ntranslators. Our research efforts also extend to the development of a neural\nmachine translation model nicknamed Tilmash. Remarkably, the performance of\nTilmash is on par with, and in certain instances, surpasses that of industry\ngiants, such as Google Translate and Yandex Translate, as measured by standard\nevaluation metrics, such as BLEU and chrF. Both KazParC and Tilmash are openly\navailable for download under the Creative Commons Attribution 4.0 International\nLicense (CC BY 4.0) through our GitHub repository.",
|
|
172
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2403.19399.png",
|
|
173
|
+
"numComments": 0,
|
|
174
|
+
"upvoted": false,
|
|
175
|
+
"isAuthorParticipating": false,
|
|
176
|
+
"highlightedTitle": [
|
|
177
|
+
{ "type": "text", "text": "KazParC: " },
|
|
178
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
179
|
+
{ "type": "text", "text": " Parallel Corpus for Machine Translation" }
|
|
180
|
+
],
|
|
181
|
+
"highlightedSummary": [
|
|
182
|
+
{
|
|
183
|
+
"type": "text",
|
|
184
|
+
"text": "We introduce KazParC, a parallel corpus designed for machine translation\nacross "
|
|
185
|
+
},
|
|
186
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
187
|
+
{
|
|
188
|
+
"type": "text",
|
|
189
|
+
"text": ", English, Russian, and Turkish. The first and largest publicly\navailable corpus of its kind, KazParC contains a collection of 371,902 parallel\nsentences covering different domains and developed with the assistance of human\ntranslators. Our research efforts also extend to the development of a neural\nmachine translation model nicknamed Tilmash. Remarkably, the performance of\nTilmash is on par with, and in certain instances, surpasses that of industry\ngiants, such as Google Translate and Yandex Translate, as measured by standard\nevaluation metrics, such as BLEU and chrF. Both KazParC and Tilmash are openly\navailable for download under the Creative Commons Attribution 4.0 International\nLicense (CC BY 4.0) through our GitHub repository."
|
|
190
|
+
}
|
|
191
|
+
]
|
|
192
|
+
},
|
|
193
|
+
{
|
|
194
|
+
"paper": {
|
|
195
|
+
"id": "2403.19335",
|
|
196
|
+
"authors": [
|
|
197
|
+
{
|
|
198
|
+
"_id": "66061e688be9fc54368de92f",
|
|
199
|
+
"user": {
|
|
200
|
+
"_id": "6426341dad1e3b0e6e90ebd6",
|
|
201
|
+
"avatarUrl": "/avatars/449b0957c458b8aedb6e56b015852bc3.svg",
|
|
202
|
+
"isPro": false,
|
|
203
|
+
"fullname": "Rustem Yeshpanov",
|
|
204
|
+
"user": "yeshpanovrustem",
|
|
205
|
+
"type": "user"
|
|
206
|
+
},
|
|
207
|
+
"name": "Rustem Yeshpanov",
|
|
208
|
+
"status": "claimed_verified",
|
|
209
|
+
"statusLastChangedAt": "2024-11-22T10:02:19.912Z",
|
|
210
|
+
"hidden": false
|
|
211
|
+
},
|
|
212
|
+
{
|
|
213
|
+
"_id": "66061e688be9fc54368de930",
|
|
214
|
+
"name": "Huseyin Atakan Varol",
|
|
215
|
+
"hidden": false
|
|
216
|
+
}
|
|
217
|
+
],
|
|
218
|
+
"publishedAt": "2024-03-28T11:51:11.000Z",
|
|
219
|
+
"title": "KazSAnDRA: Kazakh Sentiment Analysis Dataset of Reviews and Attitudes",
|
|
220
|
+
"summary": "This paper presents KazSAnDRA, a dataset developed for Kazakh sentiment\nanalysis that is the first and largest publicly available dataset of its kind.\nKazSAnDRA comprises an extensive collection of 180,064 reviews obtained from\nvarious sources and includes numerical ratings ranging from 1 to 5, providing a\nquantitative representation of customer attitudes. The study also pursued the\nautomation of Kazakh sentiment classification through the development and\nevaluation of four machine learning models trained for both polarity\nclassification and score classification. Experimental analysis included\nevaluation of the results considering both balanced and imbalanced scenarios.\nThe most successful model attained an F1-score of 0.81 for polarity\nclassification and 0.39 for score classification on the test sets. The dataset\nand fine-tuned models are open access and available for download under the\nCreative Commons Attribution 4.0 International License (CC BY 4.0) through our\nGitHub repository.",
|
|
221
|
+
"upvotes": 0,
|
|
222
|
+
"discussionId": "66061e688be9fc54368de946"
|
|
223
|
+
},
|
|
224
|
+
"publishedAt": "2024-03-28T07:51:11.000Z",
|
|
225
|
+
"title": "KazSAnDRA: Kazakh Sentiment Analysis Dataset of Reviews and Attitudes",
|
|
226
|
+
"summary": "This paper presents KazSAnDRA, a dataset developed for Kazakh sentiment\nanalysis that is the first and largest publicly available dataset of its kind.\nKazSAnDRA comprises an extensive collection of 180,064 reviews obtained from\nvarious sources and includes numerical ratings ranging from 1 to 5, providing a\nquantitative representation of customer attitudes. The study also pursued the\nautomation of Kazakh sentiment classification through the development and\nevaluation of four machine learning models trained for both polarity\nclassification and score classification. Experimental analysis included\nevaluation of the results considering both balanced and imbalanced scenarios.\nThe most successful model attained an F1-score of 0.81 for polarity\nclassification and 0.39 for score classification on the test sets. The dataset\nand fine-tuned models are open access and available for download under the\nCreative Commons Attribution 4.0 International License (CC BY 4.0) through our\nGitHub repository.",
|
|
227
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2403.19335.png",
|
|
228
|
+
"numComments": 0,
|
|
229
|
+
"upvoted": false,
|
|
230
|
+
"isAuthorParticipating": false,
|
|
231
|
+
"highlightedTitle": [
|
|
232
|
+
{ "type": "text", "text": "KazSAnDRA: " },
|
|
233
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
234
|
+
{
|
|
235
|
+
"type": "text",
|
|
236
|
+
"text": " Sentiment Analysis Dataset of Reviews and Attitudes"
|
|
237
|
+
}
|
|
238
|
+
],
|
|
239
|
+
"highlightedSummary": [
|
|
240
|
+
{
|
|
241
|
+
"type": "text",
|
|
242
|
+
"text": "This paper presents KazSAnDRA, a dataset developed for "
|
|
243
|
+
},
|
|
244
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
245
|
+
{
|
|
246
|
+
"type": "text",
|
|
247
|
+
"text": " sentiment\nanalysis that is the first and largest publicly available dataset of its kind.\nKazSAnDRA comprises an extensive collection of 180,064 reviews obtained from\nvarious sources and includes numerical ratings ranging from 1 to 5, providing a\nquantitative representation of customer attitudes. The study also pursued the\nautomation of Kazakh sentiment classification through the development and\nevaluation of four machine learning models trained for both polarity\nclassification and score classification. Experimental analysis included\nevaluation of the results considering both balanced and imbalanced scenarios.\nThe most successful model attained an F1-score of 0.81 for polarity\nclassification and 0.39 for score classification on the test sets. The dataset\nand fine-tuned models are open access and available for download under the\nCreative Commons Attribution 4.0 International License (CC BY 4.0) through our\nGitHub repository."
|
|
248
|
+
}
|
|
249
|
+
]
|
|
250
|
+
},
|
|
251
|
+
{
|
|
252
|
+
"paper": {
|
|
253
|
+
"id": "2111.13419",
|
|
254
|
+
"authors": [
|
|
255
|
+
{
|
|
256
|
+
"_id": "67404bb4f8ad6c78246b416d",
|
|
257
|
+
"user": {
|
|
258
|
+
"_id": "6426341dad1e3b0e6e90ebd6",
|
|
259
|
+
"avatarUrl": "/avatars/449b0957c458b8aedb6e56b015852bc3.svg",
|
|
260
|
+
"isPro": false,
|
|
261
|
+
"fullname": "Rustem Yeshpanov",
|
|
262
|
+
"user": "yeshpanovrustem",
|
|
263
|
+
"type": "user"
|
|
264
|
+
},
|
|
265
|
+
"name": "Rustem Yeshpanov",
|
|
266
|
+
"status": "claimed_verified",
|
|
267
|
+
"statusLastChangedAt": "2024-11-29T10:08:23.718Z",
|
|
268
|
+
"hidden": false
|
|
269
|
+
},
|
|
270
|
+
{
|
|
271
|
+
"_id": "67404bb4f8ad6c78246b416e",
|
|
272
|
+
"name": "Yerbolat Khassanov",
|
|
273
|
+
"hidden": false
|
|
274
|
+
},
|
|
275
|
+
{
|
|
276
|
+
"_id": "67404bb4f8ad6c78246b416f",
|
|
277
|
+
"name": "Huseyin Atakan Varol",
|
|
278
|
+
"hidden": false
|
|
279
|
+
}
|
|
280
|
+
],
|
|
281
|
+
"publishedAt": "2021-11-26T10:56:19.000Z",
|
|
282
|
+
"title": "KazNERD: Kazakh Named Entity Recognition Dataset",
|
|
283
|
+
"summary": "We present the development of a dataset for Kazakh named entity recognition.\nThe dataset was built as there is a clear need for publicly available annotated\ncorpora in Kazakh, as well as annotation guidelines containing\nstraightforward--but rigorous--rules and examples. The dataset annotation,\nbased on the IOB2 scheme, was carried out on television news text by two native\nKazakh speakers under the supervision of the first author. The resulting\ndataset contains 112,702 sentences and 136,333 annotations for 25 entity\nclasses. State-of-the-art machine learning models to automatise Kazakh named\nentity recognition were also built, with the best-performing model achieving an\nexact match F1-score of 97.22% on the test set. The annotated dataset,\nguidelines, and codes used to train the models are freely available for\ndownload under the CC BY 4.0 licence from https://github.com/IS2AI/KazNERD.",
|
|
284
|
+
"upvotes": 0,
|
|
285
|
+
"discussionId": "67404bb5f8ad6c78246b4197"
|
|
286
|
+
},
|
|
287
|
+
"publishedAt": "2021-11-26T05:56:19.000Z",
|
|
288
|
+
"title": "KazNERD: Kazakh Named Entity Recognition Dataset",
|
|
289
|
+
"summary": "We present the development of a dataset for Kazakh named entity recognition.\nThe dataset was built as there is a clear need for publicly available annotated\ncorpora in Kazakh, as well as annotation guidelines containing\nstraightforward--but rigorous--rules and examples. The dataset annotation,\nbased on the IOB2 scheme, was carried out on television news text by two native\nKazakh speakers under the supervision of the first author. The resulting\ndataset contains 112,702 sentences and 136,333 annotations for 25 entity\nclasses. State-of-the-art machine learning models to automatise Kazakh named\nentity recognition were also built, with the best-performing model achieving an\nexact match F1-score of 97.22% on the test set. The annotated dataset,\nguidelines, and codes used to train the models are freely available for\ndownload under the CC BY 4.0 licence from https://github.com/IS2AI/KazNERD.",
|
|
290
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2111.13419.png",
|
|
291
|
+
"numComments": 0,
|
|
292
|
+
"upvoted": false,
|
|
293
|
+
"isAuthorParticipating": false,
|
|
294
|
+
"highlightedTitle": [
|
|
295
|
+
{ "type": "text", "text": "KazNERD: " },
|
|
296
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
297
|
+
{ "type": "text", "text": " Named Entity Recognition Dataset" }
|
|
298
|
+
],
|
|
299
|
+
"highlightedSummary": [
|
|
300
|
+
{
|
|
301
|
+
"type": "text",
|
|
302
|
+
"text": "We present the development of a dataset for "
|
|
303
|
+
},
|
|
304
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
305
|
+
{
|
|
306
|
+
"type": "text",
|
|
307
|
+
"text": " named entity recognition.\nThe dataset was built as there is a clear need for publicly available annotated\ncorpora in Kazakh, as well as annotation guidelines containing\nstraightforward--but rigorous--rules and examples. The dataset annotation,\nbased on the IOB2 scheme, was carried out on television news text by two native\nKazakh speakers under the supervision of the first author. The resulting\ndataset contains 112,702 sentences and 136,333 annotations for 25 entity\nclasses. State-of-the-art machine learning models to automatise Kazakh named\nentity recognition were also built, with the best-performing model achieving an\nexact match F1-score of 97.22% on the test set. The annotated dataset,\nguidelines, and codes used to train the models are freely available for\ndownload under the CC BY 4.0 licence from https://github.com/IS2AI/KazNERD."
|
|
308
|
+
}
|
|
309
|
+
]
|
|
310
|
+
},
|
|
311
|
+
{
|
|
312
|
+
"paper": {
|
|
313
|
+
"id": "2404.01033",
|
|
314
|
+
"authors": [
|
|
315
|
+
{
|
|
316
|
+
"_id": "674a0701239ce8cdc0ce3390",
|
|
317
|
+
"name": "Adal Abilbekov",
|
|
318
|
+
"hidden": false
|
|
319
|
+
},
|
|
320
|
+
{
|
|
321
|
+
"_id": "674a0701239ce8cdc0ce3391",
|
|
322
|
+
"name": "Saida Mussakhojayeva",
|
|
323
|
+
"hidden": false
|
|
324
|
+
},
|
|
325
|
+
{
|
|
326
|
+
"_id": "674a0701239ce8cdc0ce3392",
|
|
327
|
+
"user": {
|
|
328
|
+
"_id": "6426341dad1e3b0e6e90ebd6",
|
|
329
|
+
"avatarUrl": "/avatars/449b0957c458b8aedb6e56b015852bc3.svg",
|
|
330
|
+
"isPro": false,
|
|
331
|
+
"fullname": "Rustem Yeshpanov",
|
|
332
|
+
"user": "yeshpanovrustem",
|
|
333
|
+
"type": "user"
|
|
334
|
+
},
|
|
335
|
+
"name": "Rustem Yeshpanov",
|
|
336
|
+
"status": "claimed_verified",
|
|
337
|
+
"statusLastChangedAt": "2024-11-29T21:10:05.527Z",
|
|
338
|
+
"hidden": false
|
|
339
|
+
},
|
|
340
|
+
{
|
|
341
|
+
"_id": "674a0701239ce8cdc0ce3393",
|
|
342
|
+
"name": "Huseyin Atakan Varol",
|
|
343
|
+
"hidden": false
|
|
344
|
+
}
|
|
345
|
+
],
|
|
346
|
+
"publishedAt": "2024-04-01T10:32:04.000Z",
|
|
347
|
+
"title": "KazEmoTTS: A Dataset for Kazakh Emotional Text-to-Speech Synthesis",
|
|
348
|
+
"summary": "This study focuses on the creation of the KazEmoTTS dataset, designed for\nemotional Kazakh text-to-speech (TTS) applications. KazEmoTTS is a collection\nof 54,760 audio-text pairs, with a total duration of 74.85 hours, featuring\n34.23 hours delivered by a female narrator and 40.62 hours by two male\nnarrators. The list of the emotions considered include \"neutral\", \"angry\",\n\"happy\", \"sad\", \"scared\", and \"surprised\". We also developed a TTS model\ntrained on the KazEmoTTS dataset. Objective and subjective evaluations were\nemployed to assess the quality of synthesized speech, yielding an MCD score\nwithin the range of 6.02 to 7.67, alongside a MOS that spanned from 3.51 to\n3.57. To facilitate reproducibility and inspire further research, we have made\nour code, pre-trained model, and dataset accessible in our GitHub repository.",
|
|
349
|
+
"upvotes": 0,
|
|
350
|
+
"discussionId": "674a0702239ce8cdc0ce33dc"
|
|
351
|
+
},
|
|
352
|
+
"publishedAt": "2024-04-01T06:32:04.000Z",
|
|
353
|
+
"title": "KazEmoTTS: A Dataset for Kazakh Emotional Text-to-Speech Synthesis",
|
|
354
|
+
"summary": "This study focuses on the creation of the KazEmoTTS dataset, designed for\nemotional Kazakh text-to-speech (TTS) applications. KazEmoTTS is a collection\nof 54,760 audio-text pairs, with a total duration of 74.85 hours, featuring\n34.23 hours delivered by a female narrator and 40.62 hours by two male\nnarrators. The list of the emotions considered include \"neutral\", \"angry\",\n\"happy\", \"sad\", \"scared\", and \"surprised\". We also developed a TTS model\ntrained on the KazEmoTTS dataset. Objective and subjective evaluations were\nemployed to assess the quality of synthesized speech, yielding an MCD score\nwithin the range of 6.02 to 7.67, alongside a MOS that spanned from 3.51 to\n3.57. To facilitate reproducibility and inspire further research, we have made\nour code, pre-trained model, and dataset accessible in our GitHub repository.",
|
|
355
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2404.01033.png",
|
|
356
|
+
"numComments": 0,
|
|
357
|
+
"upvoted": false,
|
|
358
|
+
"isAuthorParticipating": false,
|
|
359
|
+
"highlightedTitle": [
|
|
360
|
+
{ "type": "text", "text": "KazEmoTTS: A Dataset for " },
|
|
361
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
362
|
+
{ "type": "text", "text": " Emotional Text-to-Speech Synthesis" }
|
|
363
|
+
],
|
|
364
|
+
"highlightedSummary": [
|
|
365
|
+
{
|
|
366
|
+
"type": "text",
|
|
367
|
+
"text": "This study focuses on the creation of the KazEmoTTS dataset, designed for\nemotional "
|
|
368
|
+
},
|
|
369
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
370
|
+
{
|
|
371
|
+
"type": "text",
|
|
372
|
+
"text": " text-to-speech (TTS) applications. KazEmoTTS is a collection\nof 54,760 audio-text pairs, with a total duration of 74.85 hours, featuring\n34.23 hours delivered by a female narrator and 40.62 hours by two male\nnarrators. The list of the emotions considered include \"neutral\", \"angry\",\n\"happy\", \"sad\", \"scared\", and \"surprised\". We also developed a TTS model\ntrained on the KazEmoTTS dataset. Objective and subjective evaluations were\nemployed to assess the quality of synthesized speech, yielding an MCD score\nwithin the range of 6.02 to 7.67, alongside a MOS that spanned from 3.51 to\n3.57. To facilitate reproducibility and inspire further research, we have made\nour code, pre-trained model, and dataset accessible in our GitHub repository."
|
|
373
|
+
}
|
|
374
|
+
]
|
|
375
|
+
},
|
|
376
|
+
{
|
|
377
|
+
"paper": {
|
|
378
|
+
"id": "2502.13640",
|
|
379
|
+
"authors": [
|
|
380
|
+
{
|
|
381
|
+
"_id": "67b9e1c192e751667385cd01",
|
|
382
|
+
"name": "Maiya Goloburda",
|
|
383
|
+
"hidden": false
|
|
384
|
+
},
|
|
385
|
+
{
|
|
386
|
+
"_id": "67b9e1c192e751667385cd02",
|
|
387
|
+
"name": "Nurkhan Laiyk",
|
|
388
|
+
"hidden": false
|
|
389
|
+
},
|
|
390
|
+
{
|
|
391
|
+
"_id": "67b9e1c192e751667385cd03",
|
|
392
|
+
"name": "Diana Turmakhan",
|
|
393
|
+
"hidden": false
|
|
394
|
+
},
|
|
395
|
+
{
|
|
396
|
+
"_id": "67b9e1c192e751667385cd04",
|
|
397
|
+
"name": "Yuxia Wang",
|
|
398
|
+
"hidden": false
|
|
399
|
+
},
|
|
400
|
+
{
|
|
401
|
+
"_id": "67b9e1c192e751667385cd05",
|
|
402
|
+
"name": "Mukhammed Togmanov",
|
|
403
|
+
"hidden": false
|
|
404
|
+
},
|
|
405
|
+
{
|
|
406
|
+
"_id": "67b9e1c192e751667385cd06",
|
|
407
|
+
"user": {
|
|
408
|
+
"_id": "6509feb92257a3afbaeecfea",
|
|
409
|
+
"avatarUrl": "https://cdn-avatars.huggingface.co/v1/production/uploads/6509feb92257a3afbaeecfea/a_UbA-2WtZeLTf0ugVzSh.jpeg",
|
|
410
|
+
"isPro": false,
|
|
411
|
+
"fullname": "Jonibek Mansurov",
|
|
412
|
+
"user": "MJonibek",
|
|
413
|
+
"type": "user"
|
|
414
|
+
},
|
|
415
|
+
"name": "Jonibek Mansurov",
|
|
416
|
+
"status": "claimed_verified",
|
|
417
|
+
"statusLastChangedAt": "2025-02-26T08:38:13.169Z",
|
|
418
|
+
"hidden": false
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
"_id": "67b9e1c192e751667385cd07",
|
|
422
|
+
"name": "Askhat Sametov",
|
|
423
|
+
"hidden": false
|
|
424
|
+
},
|
|
425
|
+
{
|
|
426
|
+
"_id": "67b9e1c192e751667385cd08",
|
|
427
|
+
"name": "Nurdaulet Mukhituly",
|
|
428
|
+
"hidden": false
|
|
429
|
+
},
|
|
430
|
+
{
|
|
431
|
+
"_id": "67b9e1c192e751667385cd09",
|
|
432
|
+
"name": "Minghan Wang",
|
|
433
|
+
"hidden": false
|
|
434
|
+
},
|
|
435
|
+
{
|
|
436
|
+
"_id": "67b9e1c192e751667385cd0a",
|
|
437
|
+
"name": "Daniil Orel",
|
|
438
|
+
"hidden": false
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"_id": "67b9e1c192e751667385cd0b",
|
|
442
|
+
"user": {
|
|
443
|
+
"_id": "637e8b1b66ee00bcb2468ed0",
|
|
444
|
+
"avatarUrl": "https://cdn-avatars.huggingface.co/v1/production/uploads/1669240174964-637e8b1b66ee00bcb2468ed0.jpeg",
|
|
445
|
+
"isPro": false,
|
|
446
|
+
"fullname": "Zain",
|
|
447
|
+
"user": "zainmujahid",
|
|
448
|
+
"type": "user"
|
|
449
|
+
},
|
|
450
|
+
"name": "Zain Muhammad Mujahid",
|
|
451
|
+
"status": "claimed_verified",
|
|
452
|
+
"statusLastChangedAt": "2025-02-24T09:20:09.961Z",
|
|
453
|
+
"hidden": false
|
|
454
|
+
},
|
|
455
|
+
{
|
|
456
|
+
"_id": "67b9e1c192e751667385cd0c",
|
|
457
|
+
"name": "Fajri Koto",
|
|
458
|
+
"hidden": false
|
|
459
|
+
},
|
|
460
|
+
{
|
|
461
|
+
"_id": "67b9e1c192e751667385cd0d",
|
|
462
|
+
"name": "Timothy Baldwin",
|
|
463
|
+
"hidden": false
|
|
464
|
+
},
|
|
465
|
+
{
|
|
466
|
+
"_id": "67b9e1c192e751667385cd0e",
|
|
467
|
+
"name": "Preslav Nakov",
|
|
468
|
+
"hidden": false
|
|
469
|
+
}
|
|
470
|
+
],
|
|
471
|
+
"publishedAt": "2025-02-19T11:33:22.000Z",
|
|
472
|
+
"title": "Qorgau: Evaluating LLM Safety in Kazakh-Russian Bilingual Contexts",
|
|
473
|
+
"summary": "Large language models (LLMs) are known to have the potential to generate\nharmful content, posing risks to users. While significant progress has been\nmade in developing taxonomies for LLM risks and safety evaluation prompts, most\nstudies have focused on monolingual contexts, primarily in English. However,\nlanguage- and region-specific risks in bilingual contexts are often overlooked,\nand core findings can diverge from those in monolingual settings. In this\npaper, we introduce Qorgau, a novel dataset specifically designed for safety\nevaluation in Kazakh and Russian, reflecting the unique bilingual context in\nKazakhstan, where both Kazakh (a low-resource language) and Russian (a\nhigh-resource language) are spoken. Experiments with both multilingual and\nlanguage-specific LLMs reveal notable differences in safety performance,\nemphasizing the need for tailored, region-specific datasets to ensure the\nresponsible and safe deployment of LLMs in countries like Kazakhstan. Warning:\nthis paper contains example data that may be offensive, harmful, or biased.",
|
|
474
|
+
"upvotes": 0,
|
|
475
|
+
"discussionId": "67b9e1c292e751667385cd52",
|
|
476
|
+
"ai_keywords": [
|
|
477
|
+
"transformers",
|
|
478
|
+
"LLM risks",
|
|
479
|
+
"safety Evaluationence prompts",
|
|
480
|
+
"monolingual contexts",
|
|
481
|
+
"language-specific risks",
|
|
482
|
+
"bilingual contexts",
|
|
483
|
+
"safety evaluation",
|
|
484
|
+
"dataset",
|
|
485
|
+
"Kazakh",
|
|
486
|
+
"Russian",
|
|
487
|
+
"low-resource language",
|
|
488
|
+
"high-resource language",
|
|
489
|
+
"safety performances",
|
|
490
|
+
"region-specific datasets",
|
|
491
|
+
"multilingual LLMs",
|
|
492
|
+
"language-specific LLMs",
|
|
493
|
+
"responsible deployment",
|
|
494
|
+
"LLM deployment"
|
|
495
|
+
]
|
|
496
|
+
},
|
|
497
|
+
"publishedAt": "2025-02-19T06:33:22.000Z",
|
|
498
|
+
"title": "Qorgau: Evaluating LLM Safety in Kazakh-Russian Bilingual Contexts",
|
|
499
|
+
"summary": "Large language models (LLMs) are known to have the potential to generate\nharmful content, posing risks to users. While significant progress has been\nmade in developing taxonomies for LLM risks and safety evaluation prompts, most\nstudies have focused on monolingual contexts, primarily in English. However,\nlanguage- and region-specific risks in bilingual contexts are often overlooked,\nand core findings can diverge from those in monolingual settings. In this\npaper, we introduce Qorgau, a novel dataset specifically designed for safety\nevaluation in Kazakh and Russian, reflecting the unique bilingual context in\nKazakhstan, where both Kazakh (a low-resource language) and Russian (a\nhigh-resource language) are spoken. Experiments with both multilingual and\nlanguage-specific LLMs reveal notable differences in safety performance,\nemphasizing the need for tailored, region-specific datasets to ensure the\nresponsible and safe deployment of LLMs in countries like Kazakhstan. Warning:\nthis paper contains example data that may be offensive, harmful, or biased.",
|
|
500
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2502.13640.png",
|
|
501
|
+
"numComments": 0,
|
|
502
|
+
"upvoted": false,
|
|
503
|
+
"isAuthorParticipating": false,
|
|
504
|
+
"highlightedTitle": [
|
|
505
|
+
{ "type": "text", "text": "Qorgau: Evaluating LLM Safety in " },
|
|
506
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
507
|
+
{ "type": "text", "text": "-Russian Bilingual Contexts" }
|
|
508
|
+
],
|
|
509
|
+
"highlightedSummary": [
|
|
510
|
+
{
|
|
511
|
+
"type": "text",
|
|
512
|
+
"text": "Large language models (LLMs) are known to have the potential to generate\nharmful content, posing risks to users. While significant progress has been\nmade in developing taxonomies for LLM risks and safety evaluation prompts, most\nstudies have focused on monolingual contexts, primarily in English. However,\nlanguage- and region-specific risks in bilingual contexts are often overlooked,\nand core findings can diverge from those in monolingual settings. In this\npaper, we introduce Qorgau, a novel dataset specifically designed for safety\nevaluation in Kazakh and Russian, reflecting the unique bilingual context in\nKazakhstan, where both Kazakh (a low-resource language) and Russian (a\nhigh-resource language) are spoken. Experiments with both multilingual and\nlanguage-specific LLMs reveal notable differences in safety performance,\nemphasizing the need for tailored, region-specific datasets to ensure the\nresponsible and safe deployment of LLMs in countries like Kazakhstan. Warning:\nthis paper contains example data that may be offensive, harmful, or biased."
|
|
513
|
+
}
|
|
514
|
+
]
|
|
515
|
+
},
|
|
516
|
+
{
|
|
517
|
+
"paper": {
|
|
518
|
+
"id": "2503.01493",
|
|
519
|
+
"authors": [
|
|
520
|
+
{
|
|
521
|
+
"_id": "67c7e45823ded64a09851f7d",
|
|
522
|
+
"name": "Fajri Koto",
|
|
523
|
+
"hidden": false
|
|
524
|
+
},
|
|
525
|
+
{
|
|
526
|
+
"_id": "67c7e45823ded64a09851f7e",
|
|
527
|
+
"name": "Rituraj Joshi",
|
|
528
|
+
"hidden": false
|
|
529
|
+
},
|
|
530
|
+
{
|
|
531
|
+
"_id": "67c7e45823ded64a09851f7f",
|
|
532
|
+
"name": "Nurdaulet Mukhituly",
|
|
533
|
+
"hidden": false
|
|
534
|
+
},
|
|
535
|
+
{
|
|
536
|
+
"_id": "67c7e45823ded64a09851f80",
|
|
537
|
+
"name": "Yuxia Wang",
|
|
538
|
+
"hidden": false
|
|
539
|
+
},
|
|
540
|
+
{
|
|
541
|
+
"_id": "67c7e45823ded64a09851f81",
|
|
542
|
+
"name": "Zhuohan Xie",
|
|
543
|
+
"hidden": false
|
|
544
|
+
},
|
|
545
|
+
{
|
|
546
|
+
"_id": "67c7e45823ded64a09851f82",
|
|
547
|
+
"name": "Rahul Pal",
|
|
548
|
+
"hidden": false
|
|
549
|
+
},
|
|
550
|
+
{
|
|
551
|
+
"_id": "67c7e45823ded64a09851f83",
|
|
552
|
+
"name": "Daniil Orel",
|
|
553
|
+
"hidden": false
|
|
554
|
+
},
|
|
555
|
+
{
|
|
556
|
+
"_id": "67c7e45823ded64a09851f84",
|
|
557
|
+
"name": "Parvez Mullah",
|
|
558
|
+
"hidden": false
|
|
559
|
+
},
|
|
560
|
+
{
|
|
561
|
+
"_id": "67c7e45823ded64a09851f85",
|
|
562
|
+
"name": "Diana Turmakhan",
|
|
563
|
+
"hidden": false
|
|
564
|
+
},
|
|
565
|
+
{
|
|
566
|
+
"_id": "67c7e45823ded64a09851f86",
|
|
567
|
+
"name": "Maiya Goloburda",
|
|
568
|
+
"hidden": false
|
|
569
|
+
},
|
|
570
|
+
{
|
|
571
|
+
"_id": "67c7e45823ded64a09851f87",
|
|
572
|
+
"name": "Mohammed Kamran",
|
|
573
|
+
"hidden": false
|
|
574
|
+
},
|
|
575
|
+
{
|
|
576
|
+
"_id": "67c7e45823ded64a09851f88",
|
|
577
|
+
"name": "Samujjwal Ghosh",
|
|
578
|
+
"hidden": false
|
|
579
|
+
},
|
|
580
|
+
{
|
|
581
|
+
"_id": "67c7e45823ded64a09851f89",
|
|
582
|
+
"name": "Bokang Jia",
|
|
583
|
+
"hidden": false
|
|
584
|
+
},
|
|
585
|
+
{
|
|
586
|
+
"_id": "67c7e45823ded64a09851f8a",
|
|
587
|
+
"name": "Jonibek Mansurov",
|
|
588
|
+
"hidden": false
|
|
589
|
+
},
|
|
590
|
+
{
|
|
591
|
+
"_id": "67c7e45823ded64a09851f8b",
|
|
592
|
+
"name": "Mukhammed Togmanov",
|
|
593
|
+
"hidden": false
|
|
594
|
+
},
|
|
595
|
+
{
|
|
596
|
+
"_id": "67c7e45823ded64a09851f8c",
|
|
597
|
+
"name": "Debopriyo Banerjee",
|
|
598
|
+
"hidden": false
|
|
599
|
+
},
|
|
600
|
+
{
|
|
601
|
+
"_id": "67c7e45823ded64a09851f8d",
|
|
602
|
+
"name": "Nurkhan Laiyk",
|
|
603
|
+
"hidden": false
|
|
604
|
+
},
|
|
605
|
+
{
|
|
606
|
+
"_id": "67c7e45823ded64a09851f8e",
|
|
607
|
+
"name": "Akhmed Sakip",
|
|
608
|
+
"hidden": false
|
|
609
|
+
},
|
|
610
|
+
{
|
|
611
|
+
"_id": "67c7e45823ded64a09851f8f",
|
|
612
|
+
"name": "Xudong Han",
|
|
613
|
+
"hidden": false
|
|
614
|
+
},
|
|
615
|
+
{
|
|
616
|
+
"_id": "67c7e45823ded64a09851f90",
|
|
617
|
+
"name": "Ekaterina Kochmar",
|
|
618
|
+
"hidden": false
|
|
619
|
+
},
|
|
620
|
+
{
|
|
621
|
+
"_id": "67c7e45823ded64a09851f91",
|
|
622
|
+
"name": "Alham Fikri Aji",
|
|
623
|
+
"hidden": false
|
|
624
|
+
},
|
|
625
|
+
{
|
|
626
|
+
"_id": "67c7e45823ded64a09851f92",
|
|
627
|
+
"name": "Aaryamonvikram Singh",
|
|
628
|
+
"hidden": false
|
|
629
|
+
},
|
|
630
|
+
{
|
|
631
|
+
"_id": "67c7e45823ded64a09851f93",
|
|
632
|
+
"name": "Alok Anil Jadhav",
|
|
633
|
+
"hidden": false
|
|
634
|
+
},
|
|
635
|
+
{
|
|
636
|
+
"_id": "67c7e45823ded64a09851f94",
|
|
637
|
+
"name": "Satheesh Katipomu",
|
|
638
|
+
"hidden": false
|
|
639
|
+
},
|
|
640
|
+
{
|
|
641
|
+
"_id": "67c7e45823ded64a09851f95",
|
|
642
|
+
"name": "Samta Kamboj",
|
|
643
|
+
"hidden": false
|
|
644
|
+
},
|
|
645
|
+
{
|
|
646
|
+
"_id": "67c7e45823ded64a09851f96",
|
|
647
|
+
"name": "Monojit Choudhury",
|
|
648
|
+
"hidden": false
|
|
649
|
+
},
|
|
650
|
+
{
|
|
651
|
+
"_id": "67c7e45823ded64a09851f97",
|
|
652
|
+
"name": "Gurpreet Gosal",
|
|
653
|
+
"hidden": false
|
|
654
|
+
},
|
|
655
|
+
{
|
|
656
|
+
"_id": "67c7e45823ded64a09851f98",
|
|
657
|
+
"name": "Gokul Ramakrishnan",
|
|
658
|
+
"hidden": false
|
|
659
|
+
},
|
|
660
|
+
{
|
|
661
|
+
"_id": "67c7e45823ded64a09851f99",
|
|
662
|
+
"name": "Biswajit Mishra",
|
|
663
|
+
"hidden": false
|
|
664
|
+
},
|
|
665
|
+
{
|
|
666
|
+
"_id": "67c7e45823ded64a09851f9a",
|
|
667
|
+
"name": "Sarath Chandran",
|
|
668
|
+
"hidden": false
|
|
669
|
+
},
|
|
670
|
+
{
|
|
671
|
+
"_id": "67c7e45823ded64a09851f9b",
|
|
672
|
+
"name": "Avraham Sheinin",
|
|
673
|
+
"hidden": false
|
|
674
|
+
},
|
|
675
|
+
{
|
|
676
|
+
"_id": "67c7e45823ded64a09851f9c",
|
|
677
|
+
"name": "Natalia Vassilieva",
|
|
678
|
+
"hidden": false
|
|
679
|
+
},
|
|
680
|
+
{
|
|
681
|
+
"_id": "67c7e45823ded64a09851f9d",
|
|
682
|
+
"name": "Neha Sengupta",
|
|
683
|
+
"hidden": false
|
|
684
|
+
},
|
|
685
|
+
{
|
|
686
|
+
"_id": "67c7e45823ded64a09851f9e",
|
|
687
|
+
"name": "Larry Murray",
|
|
688
|
+
"hidden": false
|
|
689
|
+
},
|
|
690
|
+
{
|
|
691
|
+
"_id": "67c7e45823ded64a09851f9f",
|
|
692
|
+
"name": "Preslav Nakov",
|
|
693
|
+
"hidden": false
|
|
694
|
+
}
|
|
695
|
+
],
|
|
696
|
+
"publishedAt": "2025-03-03T13:05:48.000Z",
|
|
697
|
+
"title": "Llama-3.1-Sherkala-8B-Chat: An Open Large Language Model for Kazakh",
|
|
698
|
+
"summary": "Llama-3.1-Sherkala-8B-Chat, or Sherkala-Chat (8B) for short, is a\nstate-of-the-art instruction-tuned open generative large language model (LLM)\ndesigned for Kazakh. Sherkala-Chat (8B) aims to enhance the inclusivity of LLM\nadvancements for Kazakh speakers. Adapted from the LLaMA-3.1-8B model,\nSherkala-Chat (8B) is trained on 45.3B tokens across Kazakh, English, Russian,\nand Turkish. With 8 billion parameters, it demonstrates strong knowledge and\nreasoning abilities in Kazakh, significantly outperforming existing open Kazakh\nand multilingual models of similar scale while achieving competitive\nperformance in English. We release Sherkala-Chat (8B) as an open-weight\ninstruction-tuned model and provide a detailed overview of its training,\nfine-tuning, safety alignment, and evaluation, aiming to advance research and\nsupport diverse real-world applications.",
|
|
699
|
+
"upvotes": 1,
|
|
700
|
+
"discussionId": "67c7e45e23ded64a09852231",
|
|
701
|
+
"ai_keywords": [
|
|
702
|
+
"instruction-tuned",
|
|
703
|
+
"open generative large language model",
|
|
704
|
+
"LLaMA-3.1-8B",
|
|
705
|
+
"token",
|
|
706
|
+
"parameter-efficient fine-tuning",
|
|
707
|
+
"safety alignment"
|
|
708
|
+
]
|
|
709
|
+
},
|
|
710
|
+
"publishedAt": "2025-03-03T08:05:48.000Z",
|
|
711
|
+
"title": "Llama-3.1-Sherkala-8B-Chat: An Open Large Language Model for Kazakh",
|
|
712
|
+
"summary": "Llama-3.1-Sherkala-8B-Chat, or Sherkala-Chat (8B) for short, is a\nstate-of-the-art instruction-tuned open generative large language model (LLM)\ndesigned for Kazakh. Sherkala-Chat (8B) aims to enhance the inclusivity of LLM\nadvancements for Kazakh speakers. Adapted from the LLaMA-3.1-8B model,\nSherkala-Chat (8B) is trained on 45.3B tokens across Kazakh, English, Russian,\nand Turkish. With 8 billion parameters, it demonstrates strong knowledge and\nreasoning abilities in Kazakh, significantly outperforming existing open Kazakh\nand multilingual models of similar scale while achieving competitive\nperformance in English. We release Sherkala-Chat (8B) as an open-weight\ninstruction-tuned model and provide a detailed overview of its training,\nfine-tuning, safety alignment, and evaluation, aiming to advance research and\nsupport diverse real-world applications.",
|
|
713
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2503.01493.png",
|
|
714
|
+
"numComments": 0,
|
|
715
|
+
"upvoted": false,
|
|
716
|
+
"isAuthorParticipating": false,
|
|
717
|
+
"highlightedTitle": [
|
|
718
|
+
{
|
|
719
|
+
"type": "text",
|
|
720
|
+
"text": "Llama-3.1-Sherkala-8B-Chat: An Open Large Language Model for "
|
|
721
|
+
},
|
|
722
|
+
{ "type": "highlight", "text": "Kazakh" }
|
|
723
|
+
],
|
|
724
|
+
"highlightedSummary": [
|
|
725
|
+
{
|
|
726
|
+
"type": "text",
|
|
727
|
+
"text": "Llama-3.1-Sherkala-8B-Chat, or Sherkala-Chat (8B) for short, is a\nstate-of-the-art instruction-tuned open generative large language model (LLM)\ndesigned for "
|
|
728
|
+
},
|
|
729
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
730
|
+
{
|
|
731
|
+
"type": "text",
|
|
732
|
+
"text": ". Sherkala-Chat (8B) aims to enhance the inclusivity of LLM\nadvancements for Kazakh speakers. Adapted from the LLaMA-3.1-8B model,\nSherkala-Chat (8B) is trained on 45.3B tokens across Kazakh, English, Russian,\nand Turkish. With 8 billion parameters, it demonstrates strong knowledge and\nreasoning abilities in Kazakh, significantly outperforming existing open Kazakh\nand multilingual models of similar scale while achieving competitive\nperformance in English. We release Sherkala-Chat (8B) as an open-weight\ninstruction-tuned model and provide a detailed overview of its training,\nfine-tuning, safety alignment, and evaluation, aiming to advance research and\nsupport diverse real-world applications."
|
|
733
|
+
}
|
|
734
|
+
]
|
|
735
|
+
},
|
|
736
|
+
{
|
|
737
|
+
"paper": {
|
|
738
|
+
"id": "2108.01280",
|
|
739
|
+
"authors": [
|
|
740
|
+
{
|
|
741
|
+
"_id": "678a5b3060c77cb881de2a69",
|
|
742
|
+
"name": "Saida Mussakhojayeva",
|
|
743
|
+
"hidden": false
|
|
744
|
+
},
|
|
745
|
+
{
|
|
746
|
+
"_id": "678a5b3060c77cb881de2a6a",
|
|
747
|
+
"name": "Yerbolat Khassanov",
|
|
748
|
+
"hidden": false
|
|
749
|
+
},
|
|
750
|
+
{
|
|
751
|
+
"_id": "678a5b3060c77cb881de2a6b",
|
|
752
|
+
"name": "Huseyin Atakan Varol",
|
|
753
|
+
"hidden": false
|
|
754
|
+
}
|
|
755
|
+
],
|
|
756
|
+
"publishedAt": "2021-08-03T04:04:01.000Z",
|
|
757
|
+
"title": "A Study of Multilingual End-to-End Speech Recognition for Kazakh,\n Russian, and English",
|
|
758
|
+
"summary": "We study training a single end-to-end (E2E) automatic speech recognition\n(ASR) model for three languages used in Kazakhstan: Kazakh, Russian, and\nEnglish. We first describe the development of multilingual E2E ASR based on\nTransformer networks and then perform an extensive assessment on the\naforementioned languages. We also compare two variants of output grapheme set\nconstruction: combined and independent. Furthermore, we evaluate the impact of\nLMs and data augmentation techniques on the recognition performance of the\nmultilingual E2E ASR. In addition, we present several datasets for training and\nevaluation purposes. Experiment results show that the multilingual models\nachieve comparable performances to the monolingual baselines with a similar\nnumber of parameters. Our best monolingual and multilingual models achieved\n20.9% and 20.5% average word error rates on the combined test set,\nrespectively. To ensure the reproducibility of our experiments and results, we\nshare our training recipes, datasets, and pre-trained models.",
|
|
759
|
+
"upvotes": 0,
|
|
760
|
+
"discussionId": "678a5b3060c77cb881de2aa2",
|
|
761
|
+
"ai_keywords": [
|
|
762
|
+
"end-to-end (E2E) automatic speech recognition (ASR)",
|
|
763
|
+
"multilingual E2E ASR",
|
|
764
|
+
"Transformer networks",
|
|
765
|
+
"output grapheme set construction",
|
|
766
|
+
"language models (LMs)",
|
|
767
|
+
"data augmentation techniques",
|
|
768
|
+
"decoding",
|
|
769
|
+
"evaluation",
|
|
770
|
+
"word error rates (WER)",
|
|
771
|
+
"monolingual baselines",
|
|
772
|
+
"training recipes",
|
|
773
|
+
"pre-trained models"
|
|
774
|
+
]
|
|
775
|
+
},
|
|
776
|
+
"publishedAt": "2021-08-03T00:04:01.000Z",
|
|
777
|
+
"title": "A Study of Multilingual End-to-End Speech Recognition for Kazakh,\n Russian, and English",
|
|
778
|
+
"summary": "We study training a single end-to-end (E2E) automatic speech recognition\n(ASR) model for three languages used in Kazakhstan: Kazakh, Russian, and\nEnglish. We first describe the development of multilingual E2E ASR based on\nTransformer networks and then perform an extensive assessment on the\naforementioned languages. We also compare two variants of output grapheme set\nconstruction: combined and independent. Furthermore, we evaluate the impact of\nLMs and data augmentation techniques on the recognition performance of the\nmultilingual E2E ASR. In addition, we present several datasets for training and\nevaluation purposes. Experiment results show that the multilingual models\nachieve comparable performances to the monolingual baselines with a similar\nnumber of parameters. Our best monolingual and multilingual models achieved\n20.9% and 20.5% average word error rates on the combined test set,\nrespectively. To ensure the reproducibility of our experiments and results, we\nshare our training recipes, datasets, and pre-trained models.",
|
|
779
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2108.01280.png",
|
|
780
|
+
"numComments": 0,
|
|
781
|
+
"upvoted": false,
|
|
782
|
+
"isAuthorParticipating": false,
|
|
783
|
+
"highlightedTitle": [
|
|
784
|
+
{
|
|
785
|
+
"type": "text",
|
|
786
|
+
"text": "A Study of Multilingual End-to-End Speech Recognition for "
|
|
787
|
+
},
|
|
788
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
789
|
+
{ "type": "text", "text": ",\n Russian, and English" }
|
|
790
|
+
],
|
|
791
|
+
"highlightedSummary": [
|
|
792
|
+
{
|
|
793
|
+
"type": "text",
|
|
794
|
+
"text": "We study training a single end-to-end (E2E) automatic speech recognition\n(ASR) model for three languages used in "
|
|
795
|
+
},
|
|
796
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
797
|
+
{ "type": "text", "text": "stan: " },
|
|
798
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
799
|
+
{
|
|
800
|
+
"type": "text",
|
|
801
|
+
"text": ", Russian, and\nEnglish. We first describe the development of multilingual E2E ASR based on\nTransformer networks and then perform an extensive assessment on the\naforementioned languages. We also compare two variants of output grapheme set\nconstruction: combined and independent. Furthermore, we evaluate the impact of\nLMs and data augmentation techniques on the recognition performance of the\nmultilingual E2E ASR. In addition, we present several datasets for training and\nevaluation purposes. Experiment results show that the multilingual models\nachieve comparable performances to the monolingual baselines with a similar\nnumber of parameters. Our best monolingual and multilingual models achieved\n20.9% and 20.5% average word error rates on the combined test set,\nrespectively. To ensure the reproducibility of our experiments and results, we\nshare our training recipes, datasets, and pre-trained models."
|
|
802
|
+
}
|
|
803
|
+
]
|
|
804
|
+
},
|
|
805
|
+
{
|
|
806
|
+
"paper": {
|
|
807
|
+
"id": "2107.10637",
|
|
808
|
+
"authors": [
|
|
809
|
+
{
|
|
810
|
+
"_id": "64f63bf05f2dee8a6b87948e",
|
|
811
|
+
"user": {
|
|
812
|
+
"_id": "619957e7d7f09e0d8b7714fa",
|
|
813
|
+
"avatarUrl": "/avatars/71120837c2e7816f3ec8b2c42978ce9f.svg",
|
|
814
|
+
"isPro": false,
|
|
815
|
+
"fullname": "Ilnar Salimzianov",
|
|
816
|
+
"user": "ifs",
|
|
817
|
+
"type": "user"
|
|
818
|
+
},
|
|
819
|
+
"name": "Ilnar Salimzianov",
|
|
820
|
+
"status": "extracted_pending",
|
|
821
|
+
"statusLastChangedAt": "2023-09-04T20:20:01.065Z",
|
|
822
|
+
"hidden": false
|
|
823
|
+
}
|
|
824
|
+
],
|
|
825
|
+
"publishedAt": "2021-07-19T14:17:42.000Z",
|
|
826
|
+
"title": "A baseline model for computationally inexpensive speech recognition for\n Kazakh using the Coqui STT framework",
|
|
827
|
+
"summary": "Mobile devices are transforming the way people interact with computers, and\nspeech interfaces to applications are ever more important. Automatic Speech\nRecognition systems recently published are very accurate, but often require\npowerful machinery (specialised Graphical Processing Units) for inference,\nwhich makes them impractical to run on commodity devices, especially in\nstreaming mode. Impressed by the accuracy of, but dissatisfied with the\ninference times of the baseline Kazakh ASR model of (Khassanov et al.,2021)\nwhen not using a GPU, we trained a new baseline acoustic model (on the same\ndataset as the aforementioned paper) and three language models for use with the\nCoqui STT framework. Results look promising, but further epochs of training and\nparameter sweeping or, alternatively, limiting the vocabulary that the ASR\nsystem must support, is needed to reach a production-level accuracy.",
|
|
828
|
+
"upvotes": 0,
|
|
829
|
+
"discussionId": "64f63bf15f2dee8a6b87949a",
|
|
830
|
+
"ai_keywords": ["Automatic Speech Recognition", "acoustic model", "language models", "Coqui STT framework"]
|
|
831
|
+
},
|
|
832
|
+
"publishedAt": "2021-07-19T10:17:42.000Z",
|
|
833
|
+
"title": "A baseline model for computationally inexpensive speech recognition for\n Kazakh using the Coqui STT framework",
|
|
834
|
+
"summary": "Mobile devices are transforming the way people interact with computers, and\nspeech interfaces to applications are ever more important. Automatic Speech\nRecognition systems recently published are very accurate, but often require\npowerful machinery (specialised Graphical Processing Units) for inference,\nwhich makes them impractical to run on commodity devices, especially in\nstreaming mode. Impressed by the accuracy of, but dissatisfied with the\ninference times of the baseline Kazakh ASR model of (Khassanov et al.,2021)\nwhen not using a GPU, we trained a new baseline acoustic model (on the same\ndataset as the aforementioned paper) and three language models for use with the\nCoqui STT framework. Results look promising, but further epochs of training and\nparameter sweeping or, alternatively, limiting the vocabulary that the ASR\nsystem must support, is needed to reach a production-level accuracy.",
|
|
835
|
+
"thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2107.10637.png",
|
|
836
|
+
"numComments": 0,
|
|
837
|
+
"upvoted": false,
|
|
838
|
+
"isAuthorParticipating": false,
|
|
839
|
+
"highlightedTitle": [
|
|
840
|
+
{
|
|
841
|
+
"type": "text",
|
|
842
|
+
"text": "A baseline model for computationally inexpensive speech recognition for\n "
|
|
843
|
+
},
|
|
844
|
+
{ "type": "highlight", "text": "Kazakh" },
|
|
845
|
+
{ "type": "text", "text": " using the Coqui STT framework" }
|
|
846
|
+
],
|
|
847
|
+
"highlightedSummary": [
|
|
848
|
+
{
|
|
849
|
+
"type": "text",
|
|
850
|
+
"text": "Mobile devices are transforming the way people interact with computers, and\nspeech interfaces to applications are ever more important. Automatic Speech\nRecognition systems recently published are very accurate, but often require\npowerful machinery (specialised Graphical Processing Units) for inference,\nwhich makes them impractical to run on commodity devices, especially in\nstreaming mode. Impressed by the accuracy of, but dissatisfied with the\ninference times of the baseline Kazakh ASR model of (Khassanov et al.,2021)\nwhen not using a GPU, we trained a new baseline acoustic model (on the same\ndataset as the aforementioned paper) and three language models for use with the\nCoqui STT framework. Results look promising, but further epochs of training and\nparameter sweeping or, alternatively, limiting the vocabulary that the ASR\nsystem must support, is needed to reach a production-level accuracy."
|
|
851
|
+
}
|
|
852
|
+
]
|
|
853
|
+
}
|
|
854
|
+
]
|