@huggingface/tasks 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +20 -0
- package/dist/index.d.ts +358 -46
- package/dist/index.js +103 -41
- package/dist/{index.cjs → index.mjs} +73 -68
- package/package.json +43 -33
- package/src/Types.ts +49 -43
- package/src/audio-classification/about.md +5 -5
- package/src/audio-classification/data.ts +11 -11
- package/src/audio-to-audio/about.md +4 -3
- package/src/audio-to-audio/data.ts +18 -15
- package/src/automatic-speech-recognition/about.md +5 -4
- package/src/automatic-speech-recognition/data.ts +18 -17
- package/src/const.ts +52 -44
- package/src/conversational/about.md +9 -9
- package/src/conversational/data.ts +22 -18
- package/src/depth-estimation/about.md +1 -3
- package/src/depth-estimation/data.ts +11 -11
- package/src/document-question-answering/about.md +1 -2
- package/src/document-question-answering/data.ts +22 -19
- package/src/feature-extraction/about.md +2 -3
- package/src/feature-extraction/data.ts +12 -15
- package/src/fill-mask/about.md +1 -1
- package/src/fill-mask/data.ts +16 -14
- package/src/image-classification/about.md +5 -3
- package/src/image-classification/data.ts +15 -15
- package/src/image-segmentation/about.md +4 -4
- package/src/image-segmentation/data.ts +26 -23
- package/src/image-to-image/about.md +8 -10
- package/src/image-to-image/data.ts +31 -27
- package/src/image-to-text/about.md +13 -6
- package/src/image-to-text/data.ts +20 -21
- package/src/index.ts +2 -0
- package/src/modelLibraries.ts +43 -0
- package/src/object-detection/about.md +2 -1
- package/src/object-detection/data.ts +20 -17
- package/src/pipelines.ts +608 -0
- package/src/placeholder/about.md +3 -3
- package/src/placeholder/data.ts +8 -8
- package/src/question-answering/about.md +1 -1
- package/src/question-answering/data.ts +21 -19
- package/src/reinforcement-learning/about.md +167 -176
- package/src/reinforcement-learning/data.ts +75 -78
- package/src/sentence-similarity/data.ts +29 -28
- package/src/summarization/about.md +6 -5
- package/src/summarization/data.ts +23 -20
- package/src/table-question-answering/about.md +5 -5
- package/src/table-question-answering/data.ts +35 -39
- package/src/tabular-classification/about.md +4 -6
- package/src/tabular-classification/data.ts +11 -12
- package/src/tabular-regression/about.md +14 -18
- package/src/tabular-regression/data.ts +10 -11
- package/src/tasksData.ts +47 -50
- package/src/text-classification/about.md +5 -4
- package/src/text-classification/data.ts +21 -20
- package/src/text-generation/about.md +7 -6
- package/src/text-generation/data.ts +36 -34
- package/src/text-to-image/about.md +19 -18
- package/src/text-to-image/data.ts +32 -26
- package/src/text-to-speech/about.md +4 -5
- package/src/text-to-speech/data.ts +16 -17
- package/src/text-to-video/about.md +41 -36
- package/src/text-to-video/data.ts +43 -38
- package/src/token-classification/about.md +1 -3
- package/src/token-classification/data.ts +26 -25
- package/src/translation/about.md +4 -4
- package/src/translation/data.ts +21 -21
- package/src/unconditional-image-generation/about.md +10 -5
- package/src/unconditional-image-generation/data.ts +26 -20
- package/src/video-classification/about.md +5 -1
- package/src/video-classification/data.ts +14 -14
- package/src/visual-question-answering/about.md +8 -3
- package/src/visual-question-answering/data.ts +22 -19
- package/src/zero-shot-classification/about.md +5 -4
- package/src/zero-shot-classification/data.ts +20 -20
- package/src/zero-shot-image-classification/about.md +17 -9
- package/src/zero-shot-image-classification/data.ts +12 -14
- package/tsconfig.json +18 -0
- package/assets/audio-classification/audio.wav +0 -0
- package/assets/audio-to-audio/input.wav +0 -0
- package/assets/audio-to-audio/label-0.wav +0 -0
- package/assets/audio-to-audio/label-1.wav +0 -0
- package/assets/automatic-speech-recognition/input.flac +0 -0
- package/assets/automatic-speech-recognition/wav2vec2.png +0 -0
- package/assets/contribution-guide/anatomy.png +0 -0
- package/assets/contribution-guide/libraries.png +0 -0
- package/assets/depth-estimation/depth-estimation-input.jpg +0 -0
- package/assets/depth-estimation/depth-estimation-output.png +0 -0
- package/assets/document-question-answering/document-question-answering-input.png +0 -0
- package/assets/image-classification/image-classification-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-input.jpeg +0 -0
- package/assets/image-segmentation/image-segmentation-output.png +0 -0
- package/assets/image-to-image/image-to-image-input.jpeg +0 -0
- package/assets/image-to-image/image-to-image-output.png +0 -0
- package/assets/image-to-image/pix2pix_examples.jpg +0 -0
- package/assets/image-to-text/savanna.jpg +0 -0
- package/assets/object-detection/object-detection-input.jpg +0 -0
- package/assets/object-detection/object-detection-output.jpg +0 -0
- package/assets/table-question-answering/tableQA.jpg +0 -0
- package/assets/text-to-image/image.jpeg +0 -0
- package/assets/text-to-speech/audio.wav +0 -0
- package/assets/text-to-video/text-to-video-output.gif +0 -0
- package/assets/unconditional-image-generation/unconditional-image-generation-output.jpeg +0 -0
- package/assets/video-classification/video-classification-input.gif +0 -0
- package/assets/visual-question-answering/elephant.jpeg +0 -0
- package/assets/zero-shot-image-classification/image-classification-input.jpeg +0 -0
- package/dist/index.d.cts +0 -145
package/src/question-answering/data.ts

@@ -5,65 +5,67 @@ const taskData: TaskDataCustom = {
     {
       // TODO write proper description
       description: "A famous question answering dataset based on English articles from Wikipedia.",
-      id:
+      id: "squad_v2",
     },
     {
       // TODO write proper description
       description: "A dataset of aggregated anonymized actual queries issued to the Google search engine.",
-      id:
+      id: "natural_questions",
     },
   ],
   demo: {
     inputs: [
       {
-        label:
+        label: "Question",
         content: "Which name is also used to describe the Amazon rainforest in English?",
-        type:
+        type: "text",
       },
       {
-        label:
+        label: "Context",
         content: "The Amazon rainforest, also known in English as Amazonia or the Amazon Jungle",
-        type:
+        type: "text",
       },
     ],
     outputs: [
       {
-        label:
+        label: "Answer",
         content: "Amazonia",
-        type:
+        type: "text",
       },
     ],
   },
   metrics: [
     {
-      description:
-
+      description:
+        "Exact Match is a metric based on the strict character match of the predicted answer and the right answer. For answers predicted correctly, the Exact Match will be 1. Even if only one character is different, Exact Match will be 0",
+      id: "exact-match",
     },
     {
-      description:
-
+      description:
+        " The F1-Score metric is useful if we value both false positives and false negatives equally. The F1-Score is calculated on each word in the predicted sequence against the correct answer",
+      id: "f1",
     },
   ],
   models: [
     {
       description: "A robust baseline model for most question answering domains.",
-      id:
+      id: "deepset/roberta-base-squad2",
     },
     {
       description: "A special model that can answer questions from tables!",
-      id:
+      id: "google/tapas-base-finetuned-wtq",
     },
   ],
-  spaces:
+  spaces: [
     {
       description: "An application that can answer a long question from Wikipedia.",
-      id:
+      id: "deepset/wikipedia-assistant",
     },
   ],
-  summary:
+  summary:
+    "Question Answering models can retrieve the answer to a question from a given text, which is useful for searching for an answer in a document. Some question answering models can generate answers without context!",
   widgetModels: ["deepset/roberta-base-squad2"],
-  youtubeId:
+  youtubeId: "ajPx5LwJD-I",
 };
 
-
 export default taskData;
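The exact-match and F1 metrics described in the hunk above can be illustrated with a short sketch. This is not code from the package; the helper functions below are hypothetical and use simple whitespace tokenization:

```python
# Illustrative sketch of the exact-match and word-level F1 metrics described above;
# not part of @huggingface/tasks.
from collections import Counter

def exact_match(prediction: str, reference: str) -> int:
    # 1 only when the predicted answer matches the reference exactly, 0 otherwise
    return int(prediction.strip() == reference.strip())

def f1_score(prediction: str, reference: str) -> float:
    # F1 is computed over the words shared by the predicted and reference answers
    pred_tokens = prediction.split()
    ref_tokens = reference.split()
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

print(exact_match("Amazonia", "Amazonia"))                       # 1
print(round(f1_score("the Amazon Jungle", "Amazon Jungle"), 2))  # 0.8
```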
package/src/reinforcement-learning/about.md

@@ -1,176 +1,167 @@
-## Use Cases
-
-### Gaming
-
-Reinforcement learning is known for its application to video games. Since the games provide a safe environment for the agent to be trained in the sense that it is perfectly defined and controllable, this makes them perfect candidates for experimentation and will help a lot to learn about the capabilities and limitations of various RL algorithms.
-
-- [Reinforcement Learning from Human Feedback From Zero to ChatGPT](https://www.youtube.com/watch?v=EAd4oQtEJOM)
-- [Guide on Multi-Agent Competition Systems](https://huggingface.co/blog/aivsai)
-
-### Notebooks
-- [Train a Deep Reinforcement Learning lander agent to land correctly on the Moon 🌕 using Stable-Baselines3](https://github.com/huggingface/deep-rl-class/blob/main/notebooks/unit1/unit1.ipynb)
-- [Introduction to Unity MLAgents](https://github.com/huggingface/deep-rl-class/blob/main/notebooks/unit5/unit5.ipynb)
-- [Training Decision Transformers with 🤗 transformers](https://github.com/huggingface/blog/blob/main/notebooks/101_train-decision-transformers.ipynb)
-
-This page was made possible thanks to the efforts of [Ram Ananth](https://huggingface.co/RamAnanth1), [Emilio Lehoucq](https://huggingface.co/emiliol), [Sagar Mathpal](https://huggingface.co/sagarmathpal) and [Osman Alenbey](https://huggingface.co/osman93).
+## Use Cases
+
+### Gaming
+
+Reinforcement learning is known for its application to video games. Since games provide a safe, perfectly defined and controllable environment in which to train the agent, they are perfect candidates for experimentation and help a lot in learning about the capabilities and limitations of various RL algorithms.
+
+There are many videos on the Internet where a game-playing reinforcement learning agent starts with a terrible gaming strategy due to random initialization of its settings, but over iterations, the agent gets better and better with each episode of the training. This [paper](https://arxiv.org/abs/1912.10944) mainly investigates the performance of RL in popular games such as Minecraft or Dota2. The agent's performance can exceed a human player's, although there are still some challenges mainly related to efficiency in constructing the gaming policy of the reinforcement learning agent.
+
+### Trading and Finance
+
+Reinforcement learning is the science of training computers to make decisions, and thus has a novel use in trading and finance. While time-series models are helpful in predicting prices, volume and future sales of a product or a stock, reinforcement-learning-based automated agents can decide whether to sell, buy or hold a stock. This shifts the impact of AI in this field toward real-time decision making rather than just price prediction. The glossary below clarifies some of the concepts involved in training a model to take these decisions.
+
+## Task Variants
+
+### Model Based RL
+
+Model-based reinforcement learning techniques intend to create a model of the environment, learning the state transition probabilities and the reward function in order to find the optimal action. Some typical examples of model-based reinforcement learning algorithms are dynamic programming, value iteration and policy iteration.
+
+### Model Free RL
+
+In model-free reinforcement learning, the agent decides on optimal actions based on its experience in the environment and the reward it collects from it. These methods are among the most commonly used and are beneficial in complex environments, where modeling the state transition probabilities and reward functions is difficult. Some examples of model-free reinforcement learning algorithms are SARSA, Q-Learning, actor-critic and proximal policy optimization (PPO).
+
+## Glossary
+
+<!--  TODO: Uncomment image for visual understanding if it fits within the page-->
+
+**Agent:** The learner and the decision maker.
+
+**Environment:** The part of the world the agent interacts with, comprising everything outside the agent.
+
+Observations and states are the information our agent gets from the environment. In the case of a video game, it can be a frame (a screenshot). In the case of a trading agent, it can be the value of a certain stock.
+
+**State:** Complete description of the state of the environment with no hidden information.
+
+**Observation:** Partial description of the state, in a partially observed environment.
+
+**Action:** The decision taken by the agent.
+
+**Reward:** The numerical feedback signal that the agent receives from the environment based on the chosen action.
+
+**Return:** Cumulative reward. In the simplest case, the return is the sum of the rewards.
+
+**Episode:** For some applications there is a natural notion of final time step. In this case, there is a starting point and an ending point (a terminal state). This creates an episode: a list of States, Actions, Rewards, and new States. For instance, think about Chess: an episode begins at the initial board position and ends when the game is over.
+
+**Policy:** The Policy is the brain of the Agent: it's the function that tells what action to take given the state, so it defines the agent's behavior at a given time. Reinforcement learning methods specify how the agent's policy is changed as a result of its experience.
+
+## Inference
+
+Inference in reinforcement learning differs from other modalities, in which there's a model and test data. In reinforcement learning, once you have trained an agent in an environment, you run the trained agent for additional steps to get the average reward.
+
+A typical training cycle consists of gathering experience from the environment, training the agent, and running the agent on a test environment to obtain the average reward. Below is a snippet showing how you can interact with the environment using the `gymnasium` library, train an agent using `stable-baselines3`, evaluate the agent on a test environment and infer actions from the trained agent.
+
+```python
+# Here we are running the CartPole-v1 environment for 20 steps, taking random actions
+import gymnasium as gym
+
+env = gym.make("CartPole-v1")
+observation, info = env.reset()
+
+for _ in range(20):
+    action = env.action_space.sample()  # samples a random action from the action space
+
+    # the agent takes the action
+    observation, reward, terminated, truncated, info = env.step(action)
+
+    # if the agent reaches a terminal state, we reset the environment
+    if terminated or truncated:
+        print("Environment is reset")
+        observation, info = env.reset()
+
+env.close()
+```
+
+The snippet below shows how to train a PPO model on the LunarLander-v2 environment using the `stable-baselines3` library and save the model.
+
+```python
+import gymnasium as gym
+from stable_baselines3 import PPO
+
+# initialize the environment
+env = gym.make("LunarLander-v2")
+
+# initialize the model
+model = PPO(policy="MlpPolicy",
+            env=env,
+            n_steps=1024,
+            batch_size=64,
+            n_epochs=4,
+            verbose=1)
+
+# train the model for 1000 time steps
+model.learn(total_timesteps=1000)
+
+# saving the model in the desired directory
+model_name = "PPO-LunarLander-v2"
+model.save(model_name)
+```
+
+The code below shows how to evaluate an agent trained using `stable-baselines3`.
+
+```python
+# Loading a saved model and evaluating it for 10 episodes
+import gymnasium as gym
+from stable_baselines3.common.evaluation import evaluate_policy
+from stable_baselines3 import PPO
+
+env = gym.make("LunarLander-v2")
+# Loading the saved model
+model = PPO.load("PPO-LunarLander-v2", env=env)
+
+# Initializing the evaluation environment
+eval_env = gym.make("LunarLander-v2")
+
+# Running the trained agent on eval_env for 10 evaluation episodes and getting the mean reward
+mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10,
+                                          deterministic=True)
+
+print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
+```
+
+The code snippet below shows how to infer actions from an agent trained using `stable-baselines3`.
+
+```python
+import gymnasium as gym
+from stable_baselines3 import PPO
+
+# Loading the saved model
+env = gym.make("LunarLander-v2")
+model = PPO.load("PPO-LunarLander-v2", env=env)
+
+# Getting the (vectorized) environment from the trained agent
+env = model.get_env()
+
+obs = env.reset()
+for i in range(1000):
+    # getting action predictions from the trained agent
+    action, _states = model.predict(obs, deterministic=True)
+
+    # taking the predicted action in the environment to observe the next state and rewards
+    obs, rewards, dones, info = env.step(action)
+```
+
+For more information, you can check out the documentation of the respective libraries.
+
+[Gymnasium Documentation](https://gymnasium.farama.org/)
+[Stable Baselines Documentation](https://stable-baselines3.readthedocs.io/en/master/)
+
+## Useful Resources
+
+Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful!
+
+- [HuggingFace Deep Reinforcement Learning Class](https://github.com/huggingface/deep-rl-class)
+- [Introduction to Deep Reinforcement Learning](https://huggingface.co/blog/deep-rl-intro)
+- [Stable Baselines Integration with HuggingFace](https://huggingface.co/blog/sb3)
+- Learn how reinforcement learning is used in conversational agents in this blog: [Illustrating Reinforcement Learning from Human Feedback (RLHF)](https://huggingface.co/blog/rlhf)
+- [Reinforcement Learning from Human Feedback From Zero to ChatGPT](https://www.youtube.com/watch?v=EAd4oQtEJOM)
+- [Guide on Multi-Agent Competition Systems](https://huggingface.co/blog/aivsai)
+
+### Notebooks
+
+- [Train a Deep Reinforcement Learning lander agent to land correctly on the Moon 🌕 using Stable-Baselines3](https://github.com/huggingface/deep-rl-class/blob/main/notebooks/unit1/unit1.ipynb)
+- [Introduction to Unity MLAgents](https://github.com/huggingface/deep-rl-class/blob/main/notebooks/unit5/unit5.ipynb)
+- [Training Decision Transformers with 🤗 transformers](https://github.com/huggingface/blog/blob/main/notebooks/101_train-decision-transformers.ipynb)
+
+This page was made possible thanks to the efforts of [Ram Ananth](https://huggingface.co/RamAnanth1), [Emilio Lehoucq](https://huggingface.co/emiliol), [Sagar Mathpal](https://huggingface.co/sagarmathpal) and [Osman Alenbey](https://huggingface.co/osman93).
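The model-free algorithms named in the about.md above (SARSA, Q-Learning) share the idea of updating value estimates directly from experience, without learning a transition model. The sketch below is illustrative only and not part of the package; it assumes the discrete `FrozenLake-v1` environment from `gymnasium`:

```python
# Minimal tabular Q-Learning sketch (model-free RL); illustrative only,
# not part of @huggingface/tasks.
import gymnasium as gym
import numpy as np

env = gym.make("FrozenLake-v1", is_slippery=False)
q_table = np.zeros((env.observation_space.n, env.action_space.n))

alpha, gamma, epsilon = 0.1, 0.99, 0.1  # learning rate, discount factor, exploration rate

for episode in range(500):
    state, info = env.reset()
    done = False
    while not done:
        # epsilon-greedy action selection: no transition model is ever learned
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(q_table[state]))

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-Learning update: move Q(s, a) toward reward + gamma * max_a' Q(s', a')
        q_table[state, action] += alpha * (
            reward + gamma * np.max(q_table[next_state]) - q_table[state, action]
        )
        state = next_state

env.close()
```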
package/src/reinforcement-learning/data.ts

@@ -1,78 +1,75 @@
-import type { TaskDataCustom } from "../Types";
-
-const taskData: TaskDataCustom = {
-  datasets: [
-    {
-      description:
-  ],
-};
-
-export default taskData;
+import type { TaskDataCustom } from "../Types";
+
+const taskData: TaskDataCustom = {
+  datasets: [
+    {
+      description: "A curation of widely used datasets for Data Driven Deep Reinforcement Learning (D4RL)",
+      id: "edbeeching/decision_transformer_gym_replay",
+    },
+  ],
+  demo: {
+    inputs: [
+      {
+        label: "State",
+        content: "Red traffic light, pedestrians are about to pass.",
+        type: "text",
+      },
+    ],
+    outputs: [
+      {
+        label: "Action",
+        content: "Stop the car.",
+        type: "text",
+      },
+      {
+        label: "Next State",
+        content: "Yellow light, pedestrians have crossed.",
+        type: "text",
+      },
+    ],
+  },
+  metrics: [
+    {
+      description:
+        "Accumulated reward across all time steps discounted by a factor that ranges between 0 and 1 and determines how much the agent optimizes for future relative to immediate rewards. Measures how good is the policy ultimately found by a given algorithm considering uncertainty over the future.",
+      id: "Discounted Total Reward",
+    },
+    {
+      description:
+        "Average return obtained after running the policy for a certain number of evaluation episodes. As opposed to total reward, mean reward considers how much reward a given algorithm receives while learning.",
+      id: "Mean Reward",
+    },
+    {
+      description:
+        "Measures how good a given algorithm is after a predefined time. Some algorithms may be guaranteed to converge to optimal behavior across many time steps. However, an agent that reaches an acceptable level of optimality after a given time horizon may be preferable to one that ultimately reaches optimality but takes a long time.",
+      id: "Level of Performance After Some Time",
+    },
+  ],
+  models: [
+    {
+      description: "A Reinforcement Learning model trained on expert data from the Gym Hopper environment",
+
+      id: "edbeeching/decision-transformer-gym-hopper-expert",
+    },
+    {
+      description: "A PPO agent playing seals/CartPole-v0 using the stable-baselines3 library and the RL Zoo.",
+      id: "HumanCompatibleAI/ppo-seals-CartPole-v0",
+    },
+  ],
+  spaces: [
+    {
+      description: "An application for a cute puppy agent learning to catch a stick.",
+      id: "ThomasSimonini/Huggy",
+    },
+    {
+      description: "An application to play Snowball Fight with a reinforcement learning agent.",
+      id: "ThomasSimonini/SnowballFight",
+    },
+  ],
+  summary:
+    "Reinforcement learning is the computational approach of learning from action by interacting with an environment through trial and error and receiving rewards (negative or positive) as feedback",
+  widgetModels: [],
+  youtubeId: "q0BiUn5LiBc",
+};
+
+export default taskData;
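The "Discounted Total Reward" metric listed in the data above corresponds to the discounted return G = r_0 + γ·r_1 + γ²·r_2 + …, with γ between 0 and 1. The snippet below is an illustrative sketch, not part of the package:

```python
# Illustrative computation of a discounted return, as described by the
# "Discounted Total Reward" metric above; not part of @huggingface/tasks.
def discounted_return(rewards, gamma=0.99):
    # G = r_0 + gamma * r_1 + gamma^2 * r_2 + ...
    total = 0.0
    for step, reward in enumerate(rewards):
        total += (gamma ** step) * reward
    return total

print(discounted_return([1.0, 1.0, 1.0], gamma=0.9))  # 1 + 0.9 + 0.81 = 2.71
```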