tumblrbot 1.8.0__tar.gz → 1.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/.gitignore +23 -11
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/PKG-INFO +31 -22
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/README.md +30 -21
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/pyproject.toml +1 -1
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/flow/examples.py +12 -7
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/flow/generate.py +6 -2
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/utils/models.py +24 -9
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/.github/FUNDING.yml +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/.github/dependabot.yml +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/UNLICENSE +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/sample_custom_prompts.jsonl +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/__init__.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/__main__.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/flow/__init__.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/flow/download.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/flow/fine_tune.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/utils/__init__.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/utils/common.py +0 -0
- {tumblrbot-1.8.0 → tumblrbot-1.9.1}/src/tumblrbot/utils/tumblr.py +0 -0
|
@@ -1,10 +1,3 @@
|
|
|
1
|
-
# Custom
|
|
2
|
-
data
|
|
3
|
-
*.lnk
|
|
4
|
-
config.toml
|
|
5
|
-
custom_prompts.jsonl
|
|
6
|
-
examples.jsonl
|
|
7
|
-
|
|
8
1
|
# Byte-compiled / optimized / DLL files
|
|
9
2
|
__pycache__/
|
|
10
3
|
*.py[codz]
|
|
@@ -138,6 +131,19 @@ __pypackages__/
|
|
|
138
131
|
celerybeat-schedule
|
|
139
132
|
celerybeat.pid
|
|
140
133
|
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
141
147
|
# SageMath parsed files
|
|
142
148
|
*.sage.py
|
|
143
149
|
|
|
@@ -189,11 +195,11 @@ cython_debug/
|
|
|
189
195
|
.abstra/
|
|
190
196
|
|
|
191
197
|
# Visual Studio Code
|
|
192
|
-
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
193
199
|
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
194
|
-
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
195
201
|
# you could uncomment the following to ignore the entire vscode folder
|
|
196
|
-
.vscode/
|
|
202
|
+
# .vscode/
|
|
197
203
|
|
|
198
204
|
# Ruff stuff:
|
|
199
205
|
.ruff_cache/
|
|
@@ -207,4 +213,10 @@ marimo/_lsp/
|
|
|
207
213
|
__marimo__/
|
|
208
214
|
|
|
209
215
|
# Streamlit
|
|
210
|
-
.streamlit/secrets.toml
|
|
216
|
+
.streamlit/secrets.toml
|
|
217
|
+
|
|
218
|
+
.vscode
|
|
219
|
+
data
|
|
220
|
+
*.jsonl
|
|
221
|
+
config.toml
|
|
222
|
+
tumblrbot.exe.lnk
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tumblrbot
|
|
3
|
-
Version: 1.8.0
|
|
3
|
+
Version: 1.9.1
|
|
4
4
|
Summary: An updated bot that posts to Tumblr, based on your very own blog!
|
|
5
5
|
Requires-Python: >= 3.13
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -39,13 +39,15 @@ Project-URL: Source, https://github.com/MaidScientistIzutsumiMarin/tumblrbot
|
|
|
39
39
|
[Tumblr API Documentation on Blog Identifiers]: https://tumblr.com/docs/en/api/v2#blog-identifiers
|
|
40
40
|
[Tumblr API Documentation on Rate Limits]: https://tumblr.com/docs/en/api/v2#rate-limits
|
|
41
41
|
|
|
42
|
+
[Format String]: https://docs.python.org/3/library/string.html#format-string-syntax
|
|
43
|
+
|
|
42
44
|
[Download]: src/tumblrbot/flow/download.py
|
|
43
45
|
[Examples]: src/tumblrbot/flow/examples.py
|
|
44
46
|
[Fine-Tune]: src/tumblrbot/flow/fine_tune.py
|
|
45
47
|
[Generate]: src/tumblrbot/flow/generate.py
|
|
46
48
|
[Main]: src/tumblrbot/__main__.py
|
|
47
49
|
|
|
48
|
-
[
|
|
50
|
+
[Configurable]: #configuration
|
|
49
51
|
[Fine-Tuning]: #manual-fine-tuning
|
|
50
52
|
[](https://python.org/pypi/tumblrbot)
|
|
51
53
|
|
|
@@ -60,36 +62,34 @@ Features:
|
|
|
60
62
|
1. Asks for [OpenAI] and [Tumblr] tokens.
|
|
61
63
|
- Stores API tokens using [keyring].
|
|
62
64
|
1. Retrieves [Tumblr] [OAuth] tokens.
|
|
63
|
-
1. [Downloads posts][Download] from
|
|
65
|
+
1. [Downloads posts][Download] from specified blogs ([configurable]).
|
|
64
66
|
- Skips redownloading already downloaded posts.
|
|
65
67
|
- Shows progress and previews the current post.
|
|
66
|
-
1. [Creates examples][Examples] to fine-tune the model from
|
|
68
|
+
1. [Creates examples][Examples] to fine-tune the model from the downloaded posts.
|
|
67
69
|
- Filters out posts that contain more than just text data.
|
|
68
|
-
- Filters out posts that contain
|
|
69
|
-
-
|
|
70
|
+
- Filters out posts that contain regular expressions ([configurable]).
|
|
71
|
+
- Only uses the most recent posts from each blog ([configurable]).
|
|
72
|
+
- Adds custom user messages and assistant responses to the dataset ([configurable]).
|
|
70
73
|
1. Filters out any posts flagged by the [OpenAI Moderation API].
|
|
71
74
|
1. [Uploads examples][Fine-Tune] to [OpenAI] and begins the fine-tuning process.
|
|
72
|
-
- Provides cost estimates if the currently saved examples are used to fine-tune
|
|
75
|
+
- Provides cost estimates if the currently saved examples are used to fine-tune a base model ([configurable]).
|
|
73
76
|
- Resumes monitoring the same fine-tuning process when restarted.
|
|
74
77
|
- Deletes the uploaded examples file if fine-tuning does not succeed (optional).
|
|
75
78
|
- Stores the output model automatically when fine-tuning is completed.
|
|
76
|
-
1. [Generates and uploads posts][Generate] to
|
|
77
|
-
- Creates tags by extracting keywords
|
|
78
|
-
- Uploads posts as drafts
|
|
79
|
-
- Reblogs posts from
|
|
79
|
+
1. [Generates and uploads posts][Generate] to a blog using the fine-tuned model ([configurable]).
|
|
80
|
+
- Creates tags by extracting keywords using the base model ([configurable]).
|
|
81
|
+
- Uploads posts as drafts.
|
|
82
|
+
- Reblogs posts from allowed blogs ([configurable]).
|
|
80
83
|
- Shows progress and previews the current post.
|
|
81
84
|
- Colorful output, progress bars, and post previews using [rich].
|
|
82
|
-
- Automatically keeps the [config] file up-to-date and recreates it if missing.
|
|
83
|
-
|
|
84
|
-
**To-Do:**
|
|
85
|
-
|
|
86
|
-
- Create training data from a sample of posts (possible).
|
|
85
|
+
- Automatically keeps the [config][configurable] file up-to-date and recreates it if missing (without overriding user settings).
|
|
87
86
|
|
|
88
87
|
**Known Issues:**
|
|
89
88
|
|
|
90
89
|
- Sometimes, you will get an error about the training file not being found when starting fine-tuning. We do not currently have a fix or workaround for this. You should instead use the online portal for fine-tuning if this continues to happen. Read more in [fine-tuning].
|
|
91
90
|
- Post counts are incorrect when downloading posts. We are not certain what the cause of this is, but our tests suggest this is a [Tumblr] API problem that is giving inaccurate numbers.
|
|
92
91
|
- During post downloading or post generation, you may receive a "Limit Exceeded" error message from the [Tumblr] API. This is caused by server-side rate-limiting by [Tumblr]. The only workaround is trying again or waiting for a period of time before retrying. In most cases, you either have to wait for a minute or an hour for the limits to reset. You can read more about the limits in the [Tumblr API documentation on rate limits].
|
|
92
|
+
- Similar to the above issue, you may sometimes get a message saying your IP is blocked. This block is temporary and probably follows the same rules as previously described.
|
|
93
93
|
|
|
94
94
|
**Please submit an issue or contact us for features you want added/reimplemented.**
|
|
95
95
|
|
|
@@ -106,7 +106,7 @@ Features:
|
|
|
106
106
|
|
|
107
107
|
## Usage
|
|
108
108
|
|
|
109
|
-
Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config].
|
|
109
|
+
Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config][configurable].
|
|
110
110
|
|
|
111
111
|
## Obtaining Tokens
|
|
112
112
|
|
|
@@ -143,6 +143,13 @@ All file options can include directories that will be created when the program i
|
|
|
143
143
|
|
|
144
144
|
All config options that involve *blog identifiers* expect any version of a blog URL, which is explained in more detail in the [Tumblr API documentation on blog identifiers].
|
|
145
145
|
|
|
146
|
+
A valid post:
|
|
147
|
+
|
|
148
|
+
- Contains any content.
|
|
149
|
+
- Only has text.
|
|
150
|
+
- Is not an ask.
|
|
151
|
+
- Is not a reblog.
|
|
152
|
+
|
|
146
153
|
Specific Options:
|
|
147
154
|
|
|
148
155
|
- `custom_prompts_file` This file should follow the following file format:
|
|
@@ -155,6 +162,7 @@ Specific Options:
|
|
|
155
162
|
|
|
156
163
|
To be specific, it should follow the [JSON Lines] file format with one collection of name/value pairs (a dictionary) per line. You can validate your file using the [JSON Lines Validator].
|
|
157
164
|
|
|
165
|
+
- **`post_limit`** - At most, this many valid posts will be included in the training data. This effectively is a filter to select the `N` most recent valid posts from each blog. `0` will use every available valid post.
|
|
158
166
|
- **`filtered_words`** - During training data generation, any posts with the specified words will be removed. Word boundaries are not checked by default, so "the" will also filter out posts with "them" or "thematic". This setting supports regular expressions, so you can explicitly look for word boundaries by surrounding an entry with "\\\b", i.e. "\\\bthe\\\b". Regular expressions have to be escaped like so due to how JSON data is read in. If you are familiar with regular expressions, it could be useful for you to know that every entry is joined with a "|" which is then used to search the post content for any matches.
|
|
159
167
|
- **`developer_message`** - This message is used for fine-tuning the AI as well as generating prompts. If you change this, you will need to run the fine-tuning again with the new value before generating posts.
|
|
160
168
|
- **`user_message`** - This setting is used and works in the same way as `developer_message`.
|
|
@@ -166,16 +174,17 @@ Specific Options:
|
|
|
166
174
|
- **`tags_chance`** - This should be between 0 and 1. Setting it to 0 corresponds to a 0% chance (never) to add tags to a post. 1 corresponds to a 100% chance (always) to add tags to a post. Adding tags incurs a very small token cost.
|
|
167
175
|
- **`reblog_blog_identifiers`** - Whenever a reblog is attempted, a random blog from this list will be chosen to be reblogged from.
|
|
168
176
|
- **`reblog_chance`** - This setting works the same way as `tags_chance`.
|
|
169
|
-
- **`reblog_user_message`** - This setting is a
|
|
177
|
+
- **`reblog_user_message`** - This setting is a [format string]. The only argument it is formatted with is the content of the post being reblogged. In simple terms, the `{}` will be replaced with said content.
|
|
178
|
+
- *Note: The bot is only given the latest message in a reblog chain due to the required complexity and added costs of including the entire chain.*
|
|
170
179
|
|
|
171
180
|
## Manual Fine-Tuning
|
|
172
181
|
|
|
173
182
|
You can manually upload the examples file to [OpenAI] and start the fine-tuning here: [fine-tuning portal].
|
|
174
183
|
|
|
175
184
|
1. Press `+ Create`.
|
|
176
|
-
1. Select the desired `Base Model` from the dropdown. This should ideally match the model set in the [config].
|
|
177
|
-
1. Upload the generated examples file to the section under `Training data`. You can find the path for this in the [config].
|
|
185
|
+
1. Select the desired `Base Model` from the dropdown. This should ideally match the model set in the [config][configurable].
|
|
186
|
+
1. Upload the generated examples file to the section under `Training data`. You can find the path for this in the [config][configurable].
|
|
178
187
|
1. Press `Create`.
|
|
179
|
-
1. (Optional) Copy the value next to `Job ID` and paste it into the [config] under `job_id`. You can then run the program and monitor its progress as usual.
|
|
180
|
-
1. If you do not do the above, you will have to copy the value next to `Output model` once the job is complete and paste it into the [config] under `fine_tuned_model`.
|
|
188
|
+
1. (Optional) Copy the value next to `Job ID` and paste it into the [config][configurable] under `job_id`. You can then run the program and monitor its progress as usual.
|
|
189
|
+
1. If you do not do the above, you will have to copy the value next to `Output model` once the job is complete and paste it into the [config][configurable] under `fine_tuned_model`.
|
|
181
190
|
|
|
@@ -21,13 +21,15 @@
|
|
|
21
21
|
[Tumblr API Documentation on Blog Identifiers]: https://tumblr.com/docs/en/api/v2#blog-identifiers
|
|
22
22
|
[Tumblr API Documentation on Rate Limits]: https://tumblr.com/docs/en/api/v2#rate-limits
|
|
23
23
|
|
|
24
|
+
[Format String]: https://docs.python.org/3/library/string.html#format-string-syntax
|
|
25
|
+
|
|
24
26
|
[Download]: src/tumblrbot/flow/download.py
|
|
25
27
|
[Examples]: src/tumblrbot/flow/examples.py
|
|
26
28
|
[Fine-Tune]: src/tumblrbot/flow/fine_tune.py
|
|
27
29
|
[Generate]: src/tumblrbot/flow/generate.py
|
|
28
30
|
[Main]: src/tumblrbot/__main__.py
|
|
29
31
|
|
|
30
|
-
[
|
|
32
|
+
[Configurable]: #configuration
|
|
31
33
|
[Fine-Tuning]: #manual-fine-tuning
|
|
32
34
|
[](https://python.org/pypi/tumblrbot)
|
|
33
35
|
|
|
@@ -42,36 +44,34 @@ Features:
|
|
|
42
44
|
1. Asks for [OpenAI] and [Tumblr] tokens.
|
|
43
45
|
- Stores API tokens using [keyring].
|
|
44
46
|
1. Retrieves [Tumblr] [OAuth] tokens.
|
|
45
|
-
1. [Downloads posts][Download] from
|
|
47
|
+
1. [Downloads posts][Download] from specified blogs ([configurable]).
|
|
46
48
|
- Skips redownloading already downloaded posts.
|
|
47
49
|
- Shows progress and previews the current post.
|
|
48
|
-
1. [Creates examples][Examples] to fine-tune the model from
|
|
50
|
+
1. [Creates examples][Examples] to fine-tune the model from the downloaded posts.
|
|
49
51
|
- Filters out posts that contain more than just text data.
|
|
50
|
-
- Filters out posts that contain
|
|
51
|
-
-
|
|
52
|
+
- Filters out posts that contain regular expressions ([configurable]).
|
|
53
|
+
- Only uses the most recent posts from each blog ([configurable]).
|
|
54
|
+
- Adds custom user messages and assistant responses to the dataset ([configurable]).
|
|
52
55
|
1. Filters out any posts flagged by the [OpenAI Moderation API].
|
|
53
56
|
1. [Uploads examples][Fine-Tune] to [OpenAI] and begins the fine-tuning process.
|
|
54
|
-
- Provides cost estimates if the currently saved examples are used to fine-tune
|
|
57
|
+
- Provides cost estimates if the currently saved examples are used to fine-tune a base model ([configurable]).
|
|
55
58
|
- Resumes monitoring the same fine-tuning process when restarted.
|
|
56
59
|
- Deletes the uploaded examples file if fine-tuning does not succeed (optional).
|
|
57
60
|
- Stores the output model automatically when fine-tuning is completed.
|
|
58
|
-
1. [Generates and uploads posts][Generate] to
|
|
59
|
-
- Creates tags by extracting keywords
|
|
60
|
-
- Uploads posts as drafts
|
|
61
|
-
- Reblogs posts from
|
|
61
|
+
1. [Generates and uploads posts][Generate] to a blog using the fine-tuned model ([configurable]).
|
|
62
|
+
- Creates tags by extracting keywords using the base model ([configurable]).
|
|
63
|
+
- Uploads posts as drafts.
|
|
64
|
+
- Reblogs posts from allowed blogs ([configurable]).
|
|
62
65
|
- Shows progress and previews the current post.
|
|
63
66
|
- Colorful output, progress bars, and post previews using [rich].
|
|
64
|
-
- Automatically keeps the [config] file up-to-date and recreates it if missing.
|
|
65
|
-
|
|
66
|
-
**To-Do:**
|
|
67
|
-
|
|
68
|
-
- Create training data from a sample of posts (possible).
|
|
67
|
+
- Automatically keeps the [config][configurable] file up-to-date and recreates it if missing (without overriding user settings).
|
|
69
68
|
|
|
70
69
|
**Known Issues:**
|
|
71
70
|
|
|
72
71
|
- Sometimes, you will get an error about the training file not being found when starting fine-tuning. We do not currently have a fix or workaround for this. You should instead use the online portal for fine-tuning if this continues to happen. Read more in [fine-tuning].
|
|
73
72
|
- Post counts are incorrect when downloading posts. We are not certain what the cause of this is, but our tests suggest this is a [Tumblr] API problem that is giving inaccurate numbers.
|
|
74
73
|
- During post downloading or post generation, you may receive a "Limit Exceeded" error message from the [Tumblr] API. This is caused by server-side rate-limiting by [Tumblr]. The only workaround is trying again or waiting for a period of time before retrying. In most cases, you either have to wait for a minute or an hour for the limits to reset. You can read more about the limits in the [Tumblr API documentation on rate limits].
|
|
74
|
+
- Similar to the above issue, you may sometimes get a message saying your IP is blocked. This block is temporary and probably follows the same rules as previously described.
|
|
75
75
|
|
|
76
76
|
**Please submit an issue or contact us for features you want added/reimplemented.**
|
|
77
77
|
|
|
@@ -88,7 +88,7 @@ Features:
|
|
|
88
88
|
|
|
89
89
|
## Usage
|
|
90
90
|
|
|
91
|
-
Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config].
|
|
91
|
+
Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config][configurable].
|
|
92
92
|
|
|
93
93
|
## Obtaining Tokens
|
|
94
94
|
|
|
@@ -125,6 +125,13 @@ All file options can include directories that will be created when the program i
|
|
|
125
125
|
|
|
126
126
|
All config options that involve *blog identifiers* expect any version of a blog URL, which is explained in more detail in the [Tumblr API documentation on blog identifiers].
|
|
127
127
|
|
|
128
|
+
A valid post:
|
|
129
|
+
|
|
130
|
+
- Contains any content.
|
|
131
|
+
- Only has text.
|
|
132
|
+
- Is not an ask.
|
|
133
|
+
- Is not a reblog.
|
|
134
|
+
|
|
128
135
|
Specific Options:
|
|
129
136
|
|
|
130
137
|
- `custom_prompts_file` This file should follow the following file format:
|
|
@@ -137,6 +144,7 @@ Specific Options:
|
|
|
137
144
|
|
|
138
145
|
To be specific, it should follow the [JSON Lines] file format with one collection of name/value pairs (a dictionary) per line. You can validate your file using the [JSON Lines Validator].
|
|
139
146
|
|
|
147
|
+
- **`post_limit`** - At most, this many valid posts will be included in the training data. This effectively is a filter to select the `N` most recent valid posts from each blog. `0` will use every available valid post.
|
|
140
148
|
- **`filtered_words`** - During training data generation, any posts with the specified words will be removed. Word boundaries are not checked by default, so "the" will also filter out posts with "them" or "thematic". This setting supports regular expressions, so you can explicitly look for word boundaries by surrounding an entry with "\\\b", i.e. "\\\bthe\\\b". Regular expressions have to be escaped like so due to how JSON data is read in. If you are familiar with regular expressions, it could be useful for you to know that every entry is joined with a "|" which is then used to search the post content for any matches.
|
|
141
149
|
- **`developer_message`** - This message is used for fine-tuning the AI as well as generating prompts. If you change this, you will need to run the fine-tuning again with the new value before generating posts.
|
|
142
150
|
- **`user_message`** - This setting is used and works in the same way as `developer_message`.
|
|
@@ -148,15 +156,16 @@ Specific Options:
|
|
|
148
156
|
- **`tags_chance`** - This should be between 0 and 1. Setting it to 0 corresponds to a 0% chance (never) to add tags to a post. 1 corresponds to a 100% chance (always) to add tags to a post. Adding tags incurs a very small token cost.
|
|
149
157
|
- **`reblog_blog_identifiers`** - Whenever a reblog is attempted, a random blog from this list will be chosen to be reblogged from.
|
|
150
158
|
- **`reblog_chance`** - This setting works the same way as `tags_chance`.
|
|
151
|
-
- **`reblog_user_message`** - This setting is a
|
|
159
|
+
- **`reblog_user_message`** - This setting is a [format string]. The only argument it is formatted with is the content of the post being reblogged. In simple terms, the `{}` will be replaced with said content.
|
|
160
|
+
- *Note: The bot is only given the latest message in a reblog chain due to the required complexity and added costs of including the entire chain.*
|
|
152
161
|
|
|
153
162
|
## Manual Fine-Tuning
|
|
154
163
|
|
|
155
164
|
You can manually upload the examples file to [OpenAI] and start the fine-tuning here: [fine-tuning portal].
|
|
156
165
|
|
|
157
166
|
1. Press `+ Create`.
|
|
158
|
-
1. Select the desired `Base Model` from the dropdown. This should ideally match the model set in the [config].
|
|
159
|
-
1. Upload the generated examples file to the section under `Training data`. You can find the path for this in the [config].
|
|
167
|
+
1. Select the desired `Base Model` from the dropdown. This should ideally match the model set in the [config][configurable].
|
|
168
|
+
1. Upload the generated examples file to the section under `Training data`. You can find the path for this in the [config][configurable].
|
|
160
169
|
1. Press `Create`.
|
|
161
|
-
1. (Optional) Copy the value next to `Job ID` and paste it into the [config] under `job_id`. You can then run the program and monitor its progress as usual.
|
|
162
|
-
1. If you do not do the above, you will have to copy the value next to `Output model` once the job is complete and paste it into the [config] under `fine_tuned_model`.
|
|
170
|
+
1. (Optional) Copy the value next to `Job ID` and paste it into the [config][configurable] under `job_id`. You can then run the program and monitor its progress as usual.
|
|
171
|
+
1. If you do not do the above, you will have to copy the value next to `Output model` once the job is complete and paste it into the [config][configurable] under `fine_tuned_model`.
|
|
@@ -3,6 +3,7 @@ from collections.abc import Generator
|
|
|
3
3
|
from itertools import batched
|
|
4
4
|
from json import loads
|
|
5
5
|
from math import ceil
|
|
6
|
+
from pathlib import Path
|
|
6
7
|
from re import search
|
|
7
8
|
from typing import IO, override
|
|
8
9
|
|
|
@@ -29,7 +30,7 @@ class ExamplesWriter(FlowClass):
|
|
|
29
30
|
for post in self.get_valid_posts():
|
|
30
31
|
self.write_example(
|
|
31
32
|
self.config.user_message,
|
|
32
|
-
post
|
|
33
|
+
str(post),
|
|
33
34
|
fp,
|
|
34
35
|
)
|
|
35
36
|
|
|
@@ -55,13 +56,17 @@ class ExamplesWriter(FlowClass):
|
|
|
55
56
|
yield from data.items()
|
|
56
57
|
|
|
57
58
|
def get_valid_posts(self) -> Generator[Post]:
|
|
59
|
+
for path in self.get_data_paths():
|
|
60
|
+
posts = list(self.get_valid_posts_from_path(path))
|
|
61
|
+
yield from posts[-self.config.post_limit :]
|
|
62
|
+
|
|
63
|
+
def get_valid_posts_from_path(self, path: Path) -> Generator[Post]:
|
|
58
64
|
pattern = re.compile("|".join(self.config.filtered_words), re.IGNORECASE)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
yield post
|
|
65
|
+
with path.open("rb") as fp:
|
|
66
|
+
for line in fp:
|
|
67
|
+
post = Post.model_validate_json(line)
|
|
68
|
+
if post.valid_text_post() and not (post.trail and self.config.filtered_words and pattern.search(str(post))):
|
|
69
|
+
yield post
|
|
65
70
|
|
|
66
71
|
def filter_examples(self) -> None:
|
|
67
72
|
examples = self.config.examples_file.read_text("utf_8").splitlines()
|
|
@@ -34,7 +34,7 @@ class DraftGenerator(FlowClass):
|
|
|
34
34
|
|
|
35
35
|
def generate_post(self) -> Post:
|
|
36
36
|
if original := self.get_random_post():
|
|
37
|
-
user_message =
|
|
37
|
+
user_message = self.config.reblog_user_message.format(original)
|
|
38
38
|
else:
|
|
39
39
|
original = Post()
|
|
40
40
|
user_message = self.config.user_message
|
|
@@ -79,7 +79,7 @@ class DraftGenerator(FlowClass):
|
|
|
79
79
|
offset,
|
|
80
80
|
).response.posts:
|
|
81
81
|
post = Post.model_validate(raw_post)
|
|
82
|
-
if post.valid_text_post():
|
|
82
|
+
if post.valid_text_post() and self.is_trail_valid(post.trail):
|
|
83
83
|
return post
|
|
84
84
|
|
|
85
85
|
return None
|
|
@@ -89,3 +89,7 @@ class DraftGenerator(FlowClass):
|
|
|
89
89
|
total = self.tumblr.retrieve_blog_info(blog_identifier).response.blog.posts
|
|
90
90
|
# The same Iterable object is cached, so reading an element will effectively discard it. This prevents checking the same offsets twice.
|
|
91
91
|
return iter(sample(range(total), total))
|
|
92
|
+
|
|
93
|
+
def is_trail_valid(self, trail: list[Post]) -> bool:
|
|
94
|
+
# Checks if every post in the reblog trail is valid and that the blog that created the post is in the allowed reblog list.
|
|
95
|
+
return all(post.valid_text_post() and post.blog.name in self.config.reblog_blog_identifiers for post in trail)
|
|
@@ -9,7 +9,7 @@ import tomlkit
|
|
|
9
9
|
from keyring import get_password, set_password
|
|
10
10
|
from openai.types import ChatModel
|
|
11
11
|
from pwinput import pwinput
|
|
12
|
-
from pydantic import BaseModel, ConfigDict, Field, NonNegativeFloat, PlainSerializer, PositiveFloat, PositiveInt, model_validator
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field, NonNegativeFloat, NonNegativeInt, PlainSerializer, PositiveFloat, PositiveInt, model_validator
|
|
13
13
|
from pydantic.json_schema import SkipJsonSchema
|
|
14
14
|
from requests_oauthlib import OAuth1Session
|
|
15
15
|
from rich.panel import Panel
|
|
@@ -50,7 +50,8 @@ class Config(FileSyncSettings):
|
|
|
50
50
|
data_directory: Path = Field(Path("data"), description="Where to store downloaded post data.")
|
|
51
51
|
|
|
52
52
|
# Writing Examples
|
|
53
|
-
|
|
53
|
+
post_limit: NonNegativeInt = Field(0, description="The number of the most recent posts from each blog that should be included in the training data.")
|
|
54
|
+
max_moderation_batch_size: PositiveInt = Field(100, description="The number of posts, at most, to submit to the OpenAI moderation API. This is also capped by the API.")
|
|
54
55
|
custom_prompts_file: Path = Field(Path("custom_prompts.jsonl"), description="Where to read in custom prompts from.")
|
|
55
56
|
filtered_words: list[str] = Field([], description="A case-insensitive list of disallowed words used to filter out training data. Regular expressions are allowed, but must be escaped.")
|
|
56
57
|
|
|
@@ -77,7 +78,7 @@ class Config(FileSyncSettings):
|
|
|
77
78
|
tags_developer_message: str = Field("You will be provided with a block of text, and your task is to extract a very short list of the most important subjects from it.", description="The developer message used to generate tags.")
|
|
78
79
|
reblog_blog_identifiers: list[str] = Field([], description="The identifiers of blogs that can be reblogged from when generating drafts.")
|
|
79
80
|
reblog_chance: NonNegativeFloat = Field(0.1, description="The chance to generate a reblog of a random post. This will use more OpenAI tokens.")
|
|
80
|
-
reblog_user_message: str = Field("Please write a comical Tumblr post in response to the following post
|
|
81
|
+
reblog_user_message: str = Field("Please write a comical Tumblr post in response to the following post:\n{}", description="The format string for the user message used to reblog posts.")
|
|
81
82
|
|
|
82
83
|
@classmethod
|
|
83
84
|
@override
|
|
@@ -144,12 +145,16 @@ class Tokens(FileSyncSettings):
|
|
|
144
145
|
@model_validator(mode="after")
|
|
145
146
|
@override
|
|
146
147
|
def write(self) -> Self:
|
|
148
|
+
# Check if any tokens are missing or if the user wants to reset them, then set tokens if necessary.
|
|
147
149
|
if not self.openai_api_key or Confirm.ask("Reset OpenAI API key?", default=False):
|
|
148
150
|
(self.openai_api_key,) = self.online_token_prompt("https://platform.openai.com/api-keys", "API key")
|
|
149
151
|
|
|
150
152
|
if not all(self.tumblr.model_dump().values()) or Confirm.ask("Reset Tumblr API tokens?", default=False):
|
|
151
153
|
self.tumblr.client_key, self.tumblr.client_secret = self.online_token_prompt("https://tumblr.com/oauth/apps", "consumer key", "consumer secret")
|
|
152
154
|
|
|
155
|
+
# This is the whole OAuth 1.0 process.
|
|
156
|
+
# https://requests-oauthlib.readthedocs.io/en/latest/examples/tumblr.html
|
|
157
|
+
# We tried setting up OAuth 2.0, but the token refresh process is far too unreliable for this sort of program.
|
|
153
158
|
with OAuth1Session(
|
|
154
159
|
self.tumblr.client_key,
|
|
155
160
|
self.tumblr.client_secret,
|
|
@@ -169,12 +174,15 @@ class Tokens(FileSyncSettings):
|
|
|
169
174
|
|
|
170
175
|
self.tumblr.resource_owner_key, self.tumblr.resource_owner_secret = self.get_oauth_tokens(oauth_tokens)
|
|
171
176
|
|
|
177
|
+
# Regardless of whether any values were changed, we may as well write to the keyring.
|
|
178
|
+
# Any unchanged values will be set to the value they already were, since this is run after reading from the keyring.
|
|
172
179
|
set_password(self.service_name, self.username, self.model_dump_json())
|
|
173
180
|
|
|
174
181
|
return self
|
|
175
182
|
|
|
176
183
|
|
|
177
184
|
class Blog(FullyValidatedModel):
|
|
185
|
+
name: str = ""
|
|
178
186
|
posts: int = 0
|
|
179
187
|
uuid: str = ""
|
|
180
188
|
|
|
@@ -205,24 +213,31 @@ class Post(FullyValidatedModel):
|
|
|
205
213
|
|
|
206
214
|
content: SkipJsonSchema[list[Block]] = []
|
|
207
215
|
layout: SkipJsonSchema[list[Block]] = []
|
|
208
|
-
trail: SkipJsonSchema[list[
|
|
216
|
+
trail: SkipJsonSchema[list[Self]] = []
|
|
209
217
|
|
|
210
218
|
is_submission: SkipJsonSchema[bool] = False
|
|
211
219
|
|
|
212
220
|
def __rich__(self) -> Panel:
|
|
213
221
|
return Panel(
|
|
214
|
-
self
|
|
222
|
+
str(self),
|
|
215
223
|
title="Preview",
|
|
216
224
|
subtitle=" ".join(f"#{tag}" for tag in self.tags),
|
|
217
225
|
subtitle_align="left",
|
|
218
226
|
)
|
|
219
227
|
|
|
220
|
-
def
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
def get_content_text(self) -> str:
|
|
228
|
+
def __str__(self) -> str:
|
|
229
|
+
# This function is really only relevant when a post is already valid, so we don't have to check the block types.
|
|
230
|
+
# If it is called on an invalid post, it would also work, but might give strange data.
|
|
224
231
|
return "\n\n".join(block.text for block in self.content)
|
|
225
232
|
|
|
233
|
+
def valid_text_post(self) -> bool:
|
|
234
|
+
# Checks if this post:
|
|
235
|
+
# - has any content blocks (some glitched empty posts have no content)
|
|
236
|
+
# - only has content blocks of type 'text' (this excludes photo/video/poll/etc posts)
|
|
237
|
+
# - is not a submitted post
|
|
238
|
+
# - has no ask blocks in the content
|
|
239
|
+
return bool(self.content) and all(block.type == "text" for block in self.content) and not (self.is_submission or any(block.type == "ask" for block in self.layout))
|
|
240
|
+
|
|
226
241
|
|
|
227
242
|
class Example(FullyValidatedModel):
|
|
228
243
|
class Message(FullyValidatedModel):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|