tumblrbot 1.3.2__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/PKG-INFO +20 -6
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/README.md +16 -3
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/pyproject.toml +4 -3
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/__main__.py +5 -2
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/flow/download.py +26 -26
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/flow/examples.py +62 -63
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/flow/fine_tune.py +48 -60
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/flow/generate.py +30 -30
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/utils/common.py +8 -8
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/utils/config.py +23 -18
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/utils/models.py +3 -3
- tumblrbot-1.4.0/src/tumblrbot/utils/tumblr.py +49 -0
- tumblrbot-1.3.2/src/tumblrbot/utils/tumblr.py +0 -39
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/.github/dependabot.yml +0 -0
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/.gitignore +0 -0
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/UNLICENSE +0 -0
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/__init__.py +0 -0
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/flow/__init__.py +0 -0
- {tumblrbot-1.3.2 → tumblrbot-1.4.0}/src/tumblrbot/utils/__init__.py +0 -0
```diff
--- tumblrbot-1.3.2/PKG-INFO
+++ tumblrbot-1.4.0/PKG-INFO
@@ -1,18 +1,19 @@
 Metadata-Version: 2.4
 Name: tumblrbot
-Version: 1.3.2
+Version: 1.4.0
 Summary: An updated bot that posts to Tumblr, based on your very own blog!
 Requires-Python: >= 3.13
 Description-Content-Type: text/markdown
+Requires-Dist: httpx[http2]
 Requires-Dist: keyring
 Requires-Dist: more-itertools
+Requires-Dist: niquests[speedups, http3]
 Requires-Dist: openai
 Requires-Dist: pydantic
 Requires-Dist: pydantic-settings
-Requires-Dist: requests
+Requires-Dist: requests-cache
 Requires-Dist: requests-oauthlib
 Requires-Dist: rich
-Requires-Dist: tenacity
 Requires-Dist: tiktoken
 Requires-Dist: tomlkit
 Project-URL: Source, https://github.com/MaidThatPrograms/tumblrbot
@@ -71,8 +72,7 @@ Features:
 - Automatically keeps the [config] file up-to-date and recreates it if missing.
 
 **To-Do:**
-- Add documentation.
-- Finish updating [README.md].
+- Add code documentation.
 
 
 **Please submit an issue or contact us for features you want added/reimplemented.**
@@ -113,5 +113,19 @@ After inputting the [Tumblr] tokens, you will be given a URL that you need to op
 
 ## Configuration
 All config options can be found in `config.toml` after running the program once. This will be kept up-to-date if there are changes to the config's format in a future update. This also means it may be worthwhile to double-check the config file after an update. Any changes to the config should be in the changelog for a given version.
-
+
+All file options can include directories that will be created when the program is run.
+
+- **`custom_prompts_file`** - You will have to create this file yourself. It should use the following format:
+```json
+{"user message 1": "assistant response 1",
+"user message 2": "assistant response 2"}
+```
+- **`developer_message`** - This message is used for fine-tuning the AI as well as for generating prompts. If you change this, you will need to run the fine-tuning again with the new value before generating posts.
+- **`user_message`** - This message is used in the same way as `developer_message` and should be treated the same.
+- **`expected_epochs`** - The default value here is the default number of epochs for `base_model`. You may have to change this value if you change `base_model`. After running fine-tuning once, you will see the number of epochs used in the [fine-tuning portal](https://platform.openai.com/finetune) under *Hyperparameters*. This value will also be updated automatically if you run fine-tuning through this program.
+- **`token_price`** - The default value here is the default token price for `base_model`. You can find the up-to-date value [here](https://platform.openai.com/docs/pricing#fine-tuning), in the *Training* column.
+- **`job_id`** - If there is any value here, this program will resume monitoring the corresponding job, instead of starting a new one. This gets set when starting the fine-tuning and is cleared when it is completed. You can find job IDs in the [fine-tuning portal](https://platform.openai.com/finetune).
+- **`base_model`** - This value is used to choose the tokenizer for estimating fine-tuning costs. It is also the base model that will be fine-tuned and the model that is used to generate tags. You can find a list of options in the [fine-tuning portal](https://platform.openai.com/finetune) by pressing *+ Create* and opening the drop-down list for *Base Model*. Be sure to update `token_price` if you change this value.
+- **`tags_chance`** - This should be between 0 and 1. Setting it to 0 corresponds to a 0% chance (never) to add tags to a post. 1 corresponds to a 100% chance (always) to add tags to a post. Adding tags incurs a very small token cost.
 
```
```diff
--- tumblrbot-1.3.2/README.md
+++ tumblrbot-1.4.0/README.md
@@ -52,8 +52,7 @@ Features:
 - Automatically keeps the [config] file up-to-date and recreates it if missing.
 
 **To-Do:**
-- Add documentation.
-- Finish updating [README.md].
+- Add code documentation.
 
 
 **Please submit an issue or contact us for features you want added/reimplemented.**
@@ -94,4 +93,18 @@ After inputting the [Tumblr] tokens, you will be given a URL that you need to op
 
 ## Configuration
 All config options can be found in `config.toml` after running the program once. This will be kept up-to-date if there are changes to the config's format in a future update. This also means it may be worthwhile to double-check the config file after an update. Any changes to the config should be in the changelog for a given version.
-
+
+All file options can include directories that will be created when the program is run.
+
+- **`custom_prompts_file`** - You will have to create this file yourself. It should use the following format:
+```json
+{"user message 1": "assistant response 1",
+"user message 2": "assistant response 2"}
+```
+- **`developer_message`** - This message is used for fine-tuning the AI as well as for generating prompts. If you change this, you will need to run the fine-tuning again with the new value before generating posts.
+- **`user_message`** - This message is used in the same way as `developer_message` and should be treated the same.
+- **`expected_epochs`** - The default value here is the default number of epochs for `base_model`. You may have to change this value if you change `base_model`. After running fine-tuning once, you will see the number of epochs used in the [fine-tuning portal](https://platform.openai.com/finetune) under *Hyperparameters*. This value will also be updated automatically if you run fine-tuning through this program.
+- **`token_price`** - The default value here is the default token price for `base_model`. You can find the up-to-date value [here](https://platform.openai.com/docs/pricing#fine-tuning), in the *Training* column.
+- **`job_id`** - If there is any value here, this program will resume monitoring the corresponding job, instead of starting a new one. This gets set when starting the fine-tuning and is cleared when it is completed. You can find job IDs in the [fine-tuning portal](https://platform.openai.com/finetune).
+- **`base_model`** - This value is used to choose the tokenizer for estimating fine-tuning costs. It is also the base model that will be fine-tuned and the model that is used to generate tags. You can find a list of options in the [fine-tuning portal](https://platform.openai.com/finetune) by pressing *+ Create* and opening the drop-down list for *Base Model*. Be sure to update `token_price` if you change this value.
+- **`tags_chance`** - This should be between 0 and 1. Setting it to 0 corresponds to a 0% chance (never) to add tags to a post. 1 corresponds to a 100% chance (always) to add tags to a post. Adding tags incurs a very small token cost.
```
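For reference, here is one way to produce the `custom_prompts_file` described above; a minimal sketch, not part of the package, assuming the default file name from `config.py` and the JSON shape shown in the README.

```python
# Minimal sketch: write a custom_prompts.json in the documented shape, where
# each key is a user message and each value is the assistant response it
# should be paired with during fine-tuning.
import json
from pathlib import Path

prompts = {
    "user message 1": "assistant response 1",
    "user message 2": "assistant response 2",
}
Path("custom_prompts.json").write_text(json.dumps(prompts, indent=2), encoding="utf_8")
```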
```diff
--- tumblrbot-1.3.2/pyproject.toml
+++ tumblrbot-1.4.0/pyproject.toml
@@ -1,19 +1,20 @@
 [project]
 name = "tumblrbot"
-version = "1.3.2"
+version = "1.4.0"
 description = "An updated bot that posts to Tumblr, based on your very own blog!"
 readme = "README.md"
 requires-python = ">= 3.13"
 dependencies = [
+    "httpx[http2]",
     "keyring",
     "more-itertools",
+    "niquests[speedups,http3]",
     "openai",
     "pydantic",
     "pydantic-settings",
-    "requests",
+    "requests-cache",
     "requests-oauthlib",
     "rich",
-    "tenacity",
     "tiktoken",
     "tomlkit",
 ]
```
```diff
--- tumblrbot-1.3.2/src/tumblrbot/__main__.py
+++ tumblrbot-1.4.0/src/tumblrbot/__main__.py
@@ -1,4 +1,4 @@
-from openai import OpenAI
+from openai import DefaultHttpxClient, OpenAI
 from rich.prompt import Confirm
 from rich.traceback import install
 
@@ -14,7 +14,10 @@ def main() -> None:
     install()
 
     tokens = Tokens()
-    with
+    with (
+        OpenAI(api_key=tokens.openai_api_key.get_secret_value(), http_client=DefaultHttpxClient(http2=True)) as openai,
+        TumblrClient(tokens=tokens) as tumblr,
+    ):
         post_downloader = PostDownloader(openai, tumblr)
         if Confirm.ask("Download latest posts?", default=False):
             post_downloader.download()
```
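The new `with (...)` form is the parenthesized multi-item context-manager syntax from Python 3.10+, and `DefaultHttpxClient` is the `httpx.Client` subclass exported by the openai package; `http2=True` is what the new `httpx[http2]` extra enables. A reduced sketch with a placeholder key:

```python
# Reduced sketch of main()'s client wiring; the key is a placeholder and no
# network call happens at construction time.
from openai import DefaultHttpxClient, OpenAI

with (
    OpenAI(api_key="sk-placeholder", http_client=DefaultHttpxClient(http2=True)) as openai_client,
):
    ...  # flows such as PostDownloader receive this client
```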
```diff
--- tumblrbot-1.3.2/src/tumblrbot/flow/download.py
+++ tumblrbot-1.4.0/src/tumblrbot/flow/download.py
@@ -7,32 +7,6 @@ from tumblrbot.utils.models import Post
 
 
 class PostDownloader(FlowClass):
-    def paginate_posts(self, blog_identifier: str, completed: int, after: int, fp: TextIOBase, live: PreviewLive) -> None:
-        task_id = live.progress.add_task(f"Downloading posts from '{blog_identifier}'...", total=None, completed=completed)
-
-        while True:
-            response = self.tumblr.retrieve_published_posts(blog_identifier, after=after).json()["response"]
-            live.progress.update(task_id, total=response["blog"]["posts"], completed=completed)
-
-            if posts := response["posts"]:
-                for post in posts:
-                    dump(post, fp)
-                    fp.write("\n")
-
-                    model = Post.model_validate(post)
-                    after = model.timestamp
-                    live.custom_update(model)
-
-                completed += len(posts)
-            else:
-                return
-
-    def get_data_path(self, blog_identifier: str) -> Path:
-        return (self.config.data_directory / blog_identifier).with_suffix(".jsonl")
-
-    def get_data_paths(self) -> list[Path]:
-        return list(map(self.get_data_path, self.config.download_blog_identifiers))
-
     def download(self) -> None:
         self.config.data_directory.mkdir(parents=True, exist_ok=True)
 
@@ -56,3 +30,29 @@ class PostDownloader(FlowClass):
                     fp,
                     live,
                 )
+
+    def paginate_posts(self, blog_identifier: str, completed: int, after: int, fp: TextIOBase, live: PreviewLive) -> None:
+        task_id = live.progress.add_task(f"Downloading posts from '{blog_identifier}'...", total=None, completed=completed)
+
+        while True:
+            response = self.tumblr.retrieve_published_posts(blog_identifier, after=after).json()["response"]
+            live.progress.update(task_id, total=response["blog"]["posts"], completed=completed)
+
+            if posts := response["posts"]:
+                for post in posts:
+                    dump(post, fp)
+                    fp.write("\n")
+
+                    model = Post.model_validate(post)
+                    after = model.timestamp
+                    live.custom_update(model)
+
+                completed += len(posts)
+            else:
+                return
+
+    def get_data_paths(self) -> list[Path]:
+        return list(map(self.get_data_path, self.config.download_blog_identifiers))
+
+    def get_data_path(self, blog_identifier: str) -> Path:
+        return (self.config.data_directory / blog_identifier).with_suffix(".jsonl")
```
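The methods here are only reordered: `paginate_posts` still streams each raw post object to a per-blog JSONL file, one post per line, advancing the `after` cursor by each post's timestamp (the API call passes `sort=asc`). A hypothetical companion reader, not part of the package, for those files:

```python
# Hypothetical reader for the JSONL files written under config.data_directory:
# one raw Tumblr post object per line, in ascending timestamp order.
import json
from pathlib import Path
from typing import Iterator

def iter_downloaded_posts(path: Path) -> Iterator[dict]:
    with path.open(encoding="utf_8") as fp:
        for line in fp:
            yield json.loads(line)

for post in iter_downloaded_posts(Path("data/changes.jsonl")):
    print(post["timestamp"])
```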
```diff
--- tumblrbot-1.3.2/src/tumblrbot/flow/examples.py
+++ tumblrbot-1.4.0/src/tumblrbot/flow/examples.py
@@ -8,7 +8,7 @@ from typing import IO
 
 import rich
 from more_itertools import chunked
-from openai import BadRequestError
+from openai import BadRequestError
 from rich.console import Console
 from rich.prompt import Confirm
 from tiktoken import encoding_for_model, get_encoding
@@ -19,42 +19,42 @@ from tumblrbot.utils.models import Example, Post
 
 @dataclass
 class ExamplesWriter(FlowClass):
-    openai: OpenAI
     data_paths: list[Path]
 
-    def count_tokens(self) -> Generator[int]:
-        # Based on https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
-        # and https://cookbook.openai.com/examples/chat_finetuning_data_prep
-        try:
-            encoding = encoding_for_model(self.config.base_model)
-        except KeyError as error:
-            encoding = get_encoding("o200k_base")
-            Console(stderr=True, style="logging.level.warning").print(f"[Warning] Using encoding '{encoding.name}': {''.join(error.args)}\n")
+    def write_examples(self) -> None:
+        self.config.examples_file.parent.mkdir(parents=True, exist_ok=True)
 
-        with self.config.examples_file.open(encoding="utf_8") as fp:
-            for line in fp:
-                example = Example.model_validate_json(line)
-                yield len(encoding.encode("assistant"))  # every reply is primed with <|start|>assistant<|message|>
-                for message in example.messages:
-                    yield 4 + len(encoding.encode(message.content))
+        with self.config.examples_file.open("w", encoding="utf_8") as fp:
+            for user_message, assistant_response in self.get_custom_prompts():
+                self.write_example(
+                    user_message,
+                    assistant_response,
+                    fp,
+                )
 
-    def get_moderation_chunk_limit(self) -> int:
-        test_n = 1000
-        try:
-            self.openai.moderations.create(input=[""] * test_n)
-        except BadRequestError as error:
-            message = error.response.json()["error"]["message"]
-            if match := search(r"(\d+)\.", message):
-                return int(match.group(1))
-        return test_n
+            for post in self.get_filtered_posts():
+                self.write_example(
+                    self.config.user_message,
+                    post.get_content_text(),
+                    fp,
+                )
 
-
-
-
-
-
-
-
+        rich.print(f"[bold]The examples file can be found at: '{self.config.examples_file}'\n")
+
+    def write_example(self, user_message: str, assistant_message: str, fp: IO[str]) -> None:
+        example = Example(
+            messages=[
+                Example.Message(role="developer", content=self.config.developer_message),
+                Example.Message(role="user", content=user_message),
+                Example.Message(role="assistant", content=assistant_message),
+            ],
+        )
+        fp.write(f"{example.model_dump_json()}\n")
+
+    def get_custom_prompts(self) -> Generator[tuple[str, str]]:
+        if self.config.custom_prompts_file.exists():
+            text = self.config.custom_prompts_file.read_text(encoding="utf_8")
+            yield from loads(text).items()
 
     def get_filtered_posts(self) -> Generator[Post]:
         posts = list(self.get_valid_posts())
@@ -79,37 +79,36 @@ class ExamplesWriter(FlowClass):
         else:
             yield from posts
 
-    def write_examples(self) -> None:
-
-
-
-
-
-
-
-        with self.config.examples_file.open("w", encoding="utf_8") as fp:
-            for post in self.get_filtered_posts():
-                self.write_example(
-                    self.config.user_message,
-                    post.get_content_text(),
-                    fp,
-                )
+    def get_valid_posts(self) -> Generator[Post]:
+        for data_path in self.data_paths:
+            with data_path.open(encoding="utf_8") as fp:
+                for line in fp:
+                    post = Post.model_validate_json(line)
+                    if not (post.is_submission or post.trail) and post.only_text_blocks() and post.get_content_text():
+                        yield post
 
-
-
-
-
-
-
+    def get_moderation_chunk_limit(self) -> int:
+        test_n = 1000
+        try:
+            self.openai.moderations.create(input=[""] * test_n)
+        except BadRequestError as error:
+            message = error.response.json()["error"]["message"]
+            if match := search(r"(\d+)\.", message):
+                return int(match.group(1))
+        return test_n
 
-
+    def count_tokens(self) -> Generator[int]:
+        # Based on https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
+        # and https://cookbook.openai.com/examples/chat_finetuning_data_prep
+        try:
+            encoding = encoding_for_model(self.config.base_model)
+        except KeyError as error:
+            encoding = get_encoding("o200k_base")
+            Console(stderr=True, style="logging.level.warning").print(f"[Warning] Using encoding '{encoding.name}': {''.join(error.args)}\n")
 
-    def write_example(self, user_message: str, assistant_message: str, fp: IO[str]) -> None:
-        example = Example(
-            messages=[
-                Example.Message(role="developer", content=self.config.developer_message),
-                Example.Message(role="user", content=user_message),
-                Example.Message(role="assistant", content=assistant_message),
-            ],
-        )
-        fp.write(f"{example.model_dump_json()}\n")
+        with self.config.examples_file.open(encoding="utf_8") as fp:
+            for line in fp:
+                example = Example.model_validate_json(line)
+                yield len(encoding.encode("assistant"))  # every reply is primed with <|start|>assistant<|message|>
+                for message in example.messages:
+                    yield 4 + len(encoding.encode(message.content))
```
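`count_tokens` follows the cited cookbook heuristic: roughly 4 tokens of overhead per message plus its encoded content, and one "assistant" primer per training example. A worked sketch using the fallback encoding and the default messages from `config.py` (the assistant text is a stand-in):

```python
# Worked sketch of the per-example estimate count_tokens() makes.
from tiktoken import get_encoding

encoding = get_encoding("o200k_base")  # the fallback chosen on KeyError above
messages = [
    "You are a Tumblr post bot. Please generate a Tumblr post in accordance with the user's request.",
    "Please write a comical Tumblr post.",
    "an example assistant reply",
]
total = len(encoding.encode("assistant")) + sum(4 + len(encoding.encode(m)) for m in messages)
print(total)  # approximate token count for one training example
```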
```diff
--- tumblrbot-1.3.2/src/tumblrbot/flow/fine_tune.py
+++ tumblrbot-1.4.0/src/tumblrbot/flow/fine_tune.py
@@ -4,10 +4,7 @@ from textwrap import dedent
 from time import sleep
 
 import rich
-from openai import BadRequestError
-from openai.types import FileObject
 from openai.types.fine_tuning import FineTuningJob
-from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed, wait_random
 
 from tumblrbot.utils.common import FlowClass, PreviewLive
 
@@ -20,46 +17,33 @@ class FineTuner(FlowClass):
     def dedent_print(text: str) -> None:
         rich.print(dedent(text).lstrip())
 
-    def
-
-            self.dedent_print(f"""
-                Trained Tokens: {job.trained_tokens:,}
-                Cost: {self.get_cost_string(job.trained_tokens)}
-                """)
-
-        self.config.job_id = ""
+    def fine_tune(self) -> None:
+        with PreviewLive() as live:
+            job = self.create_job(live)
 
-
-
+            self.dedent_print(f"""
+                [bold]Fine-tuning is starting...[/]
+                View it online at: https://platform.openai.com/finetune/{job.id}
+                Created at: {datetime.fromtimestamp(job.created_at)}
+                Base Model: {job.model}
 
-
-
+                [italic dim]Closing this terminal will not stop the fine-tuning. This will take a while...
+                """)  # noqa: DTZ006
 
-
-        job = self.openai.fine_tuning.jobs.retrieve(self.config.job_id)
+            task_id = live.progress.add_task("", total=None)
 
-
-
+            while job.status not in {"succeeded", "failed", "cancelled"}:
+                job = self.poll_job_status()
 
-
-
-
-
-
+                live.progress.update(
+                    task_id,
+                    total=job.estimated_finish,
+                    description=f"Fine-tuning: [italic]{job.status.replace('_', ' ').title()}[/]...",
+                )
 
-
+                sleep(1)
 
-    @retry(
-        stop=stop_after_attempt(5),
-        wait=wait_fixed(1.5) + wait_random(),
-        retry=retry_if_exception_type(BadRequestError),
-        reraise=True,
-    )
-    def attempt_submit_job(self, file: FileObject) -> FineTuningJob:
-        return self.openai.fine_tuning.jobs.create(
-            model=self.config.base_model,
-            training_file=file.id,
-        )
+            self.process_completed_job(job)
 
     def create_job(self, live: PreviewLive) -> FineTuningJob:
         if self.config.job_id:
@@ -71,41 +55,42 @@
             purpose="fine-tune",
         )
 
-        job = self.attempt_submit_job(file)
+        job = self.openai.fine_tuning.jobs.create(
+            model=self.config.base_model,
+            training_file=file.id,
+        )
 
         self.config.job_id = job.id
         return job
 
-    def
-
-        job = self.create_job(live)
-
-        self.dedent_print(f"""
-            [bold]Fine-tuning is starting...[/]
-            View it online at: https://platform.openai.com/finetune/{job.id}
-            Created at: {datetime.fromtimestamp(job.created_at)}
-            Base Model: {job.model}
+    def poll_job_status(self) -> FineTuningJob:
+        job = self.openai.fine_tuning.jobs.retrieve(self.config.job_id)
 
-
-
+        if self.config.expected_epochs != job.hyperparameters.n_epochs and isinstance(job.hyperparameters.n_epochs, int):
+            self.config.expected_epochs = job.hyperparameters.n_epochs
 
-
+            self.dedent_print(f"""
+                The number of epochs has been updated to {job.hyperparameters.n_epochs}!
+                [cyan]Updated the config.
+                """)
+            self.print_estimates()
 
-
-        job = self.poll_job_status()
+        return job
 
-
-
-
-
-    )
+    def process_completed_job(self, job: FineTuningJob) -> None:
+        if job.trained_tokens is not None:
+            self.dedent_print(f"""
+                Trained Tokens: {job.trained_tokens:,}
+                Cost: {self.get_cost_string(job.trained_tokens)}
+                """)
 
-
+        self.config.job_id = ""
 
-
+        if job.status == "failed" and job.error is not None:
+            raise RuntimeError(job.error.message)
 
-
-
+        if job.fine_tuned_model:
+            self.config.fine_tuned_model = job.fine_tuned_model or ""
 
     def print_estimates(self) -> None:
         total_tokens = self.config.expected_epochs * self.estimated_tokens
@@ -118,3 +103,6 @@ class FineTuner(FlowClass):
             NOTE: Token values are approximate and may not be 100% accurate, please be aware of this when using the data.
             [italic red]Amelia, Mutsumi, and Marin are not responsible for any inaccuracies in the token count or estimated price.[/]
         """)
+
+    def get_cost_string(self, total_tokens: int) -> str:
+        return f"${self.config.token_price / 1000000 * total_tokens:.2f}"
```
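The new `get_cost_string` reads `token_price` as USD per million tokens, so the estimate is simple arithmetic:

```python
# token_price (config default 3) is $/1M tokens; 250,000 trained tokens is an
# arbitrary illustrative figure, not a value from the package.
token_price = 3.0
trained_tokens = 250_000
print(f"${token_price / 1_000_000 * trained_tokens:.2f}")  # -> $0.75
```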
```diff
--- tumblrbot-1.3.2/src/tumblrbot/flow/generate.py
+++ tumblrbot-1.4.0/src/tumblrbot/flow/generate.py
@@ -7,26 +7,20 @@ from tumblrbot.utils.models import Post
 
 
 class DraftGenerator(FlowClass):
-    def
-
-        return self.openai.responses.parse(
-            input=f"Extract the most important subjects from the following text:\n\n{content.text}",
-            model=self.config.base_model,
-            text_format=Post,
-            instructions="You are an advanced text summarization tool. You return the requested data to the user as a list of comma-separated strings.",
-            temperature=0.5,
-        ).output_parsed
-
-        return None
+    def create_drafts(self) -> None:
+        message = f"View drafts here: https://tumblr.com/blog/{self.config.upload_blog_identifier}/drafts"
 
-
-
-
-
-
-
+        with PreviewLive() as live:
+            for i in live.progress.track(range(self.config.draft_count), description="Generating drafts..."):
+                try:
+                    post = self.generate_post()
+                    self.tumblr.create_post(self.config.upload_blog_identifier, post)
+                    live.custom_update(post)
+                except BaseException as exception:
+                    exception.add_note(f"📉 An error occurred! Generated {i} draft(s) before failing. {message}")
+                    raise
 
-
+        rich.print(f":chart_increasing: [bold green]Generated {self.config.draft_count} draft(s).[/] {message}")
 
     def generate_post(self) -> Post:
         content = self.generate_content()
@@ -38,17 +32,23 @@
         post.tags = tags.tags
         return post
 
-    def
-
+    def generate_content(self) -> Post.Block:
+        content = self.openai.responses.create(
+            input=self.config.user_message,
+            instructions=self.config.developer_message,
+            model=self.config.fine_tuned_model,
+        ).output_text
 
-
-        for i in live.progress.track(range(self.config.draft_count), description="Generating drafts..."):
-            try:
-                post = self.generate_post()
-                self.tumblr.create_post(self.config.upload_blog_identifier, post)
-                live.custom_update(post)
-            except BaseException as exception:
-                exception.add_note(f"📉 An error occurred! Generated {i} draft(s) before failing. {message}")
-                raise
+        return Post.Block(type="text", text=content)
 
-
+    def generate_tags(self, content: Post.Block) -> Post | None:
+        if random() < self.config.tags_chance:  # noqa: S311
+            return self.openai.responses.parse(
+                text_format=Post,
+                input=f"Extract the most important subjects from the following text:\n\n{content.text}",
+                instructions="You are an advanced text summarization tool. You return the requested data to the user as a list of comma-separated strings.",
+                model=self.config.base_model,
+                temperature=0.5,
+            ).output_parsed
+
+        return None
```
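`generate_tags` is gated by a plain probability check, so with the default `tags_chance` of 0.1 roughly one post in ten incurs the extra tag-generation call. A sketch of just the gate:

```python
# Sketch of the gate only; the real method goes on to call
# openai.responses.parse(...) when the check passes.
from random import random

tags_chance = 0.1
wants_tags = random() < tags_chance  # true with probability tags_chance
```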
```diff
--- tumblrbot-1.3.2/src/tumblrbot/utils/common.py
+++ tumblrbot-1.4.0/src/tumblrbot/utils/common.py
@@ -13,6 +13,14 @@ from tumblrbot.utils.config import Config
 from tumblrbot.utils.tumblr import TumblrClient
 
 
+@dataclass
+class FlowClass:
+    config: ClassVar = Config()  # pyright: ignore[reportCallIssue]
+
+    openai: OpenAI
+    tumblr: TumblrClient
+
+
 class PreviewLive(Live):
     def __init__(self) -> None:
         super().__init__()
@@ -38,11 +46,3 @@ class PreviewLive(Live):
         table.add_row(self.progress)
         table.add_row(*renderables)
         self.update(table)
-
-
-@dataclass
-class FlowClass:
-    config: ClassVar = Config()  # pyright: ignore[reportCallIssue]
-
-    openai: OpenAI
-    tumblr: TumblrClient
```
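`FlowClass` simply moved above `PreviewLive`; the pattern itself is unchanged: the `ClassVar` config is one object shared by every flow, while the dataclass-generated `__init__` takes the two per-instance clients. A reduced sketch with stand-in types:

```python
# Stand-in types replace Config/OpenAI/TumblrClient to show the pattern.
from dataclasses import dataclass
from typing import ClassVar

@dataclass
class Flow:
    config: ClassVar[dict] = {"draft_count": 150}  # shared, not an __init__ field

    openai: object
    tumblr: object

a = Flow(openai=object(), tumblr=object())
b = Flow(openai=object(), tumblr=object())
print(a.config is b.config)  # True: ClassVar is excluded from the dataclass fields
```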
```diff
--- tumblrbot-1.3.2/src/tumblrbot/utils/config.py
+++ tumblrbot-1.4.0/src/tumblrbot/utils/config.py
@@ -26,28 +26,33 @@ class Config(BaseSettings):
         toml_file="config.toml",
     )
 
-
-    upload_blog_identifier: str = Field(
-        "",
-        description="The identifier of the blog which generated drafts will be uploaded to. This must be a blog associated with the same account as the configured Tumblr secret tokens.",
-    )
-    draft_count: PositiveInt = Field(150, description="The number of drafts to process. This will affect the number of tokens used with OpenAI")
-    tags_chance: NonNegativeFloat = Field(0.1, description="The chance to generate tags for any given post. This will incur extra calls to OpenAI.")
-
-    download_blog_identifiers: list[str] = Field(
-        [],
-        description="The identifiers of the blogs which post data will be downloaded from. These must be blogs associated with the same account as the configured Tumblr secret tokens.",
-    )
+    # Downloading Posts & Writing Examples
+    download_blog_identifiers: list[str] = Field([], description="The identifiers of the blogs which post data will be downloaded from. These must be blogs associated with the same account as the configured Tumblr secret tokens.")
     data_directory: Path = Field(Path("data"), description="Where to store downloaded post data.")
+
+    # Writing Examples
     custom_prompts_file: Path = Field(Path("custom_prompts.json"), description="Where to read in custom prompts from.")
+
+    # Writing Examples & Fine-Tuning
     examples_file: Path = Field(Path("examples.jsonl"), description="Where to output the examples that will be used to fine-tune the model.")
-
+
+    # Writing Examples & Generating
+    developer_message: str = Field("You are a Tumblr post bot. Please generate a Tumblr post in accordance with the user's request.", description="The developer message used by the OpenAI API to generate drafts.")
+    user_message: str = Field("Please write a comical Tumblr post.", description="The user input used by the OpenAI API to generate drafts.")
+
+    # Fine-Tuning
     expected_epochs: PositiveInt = Field(3, description="The expected number of epochs fine-tuning will be run for. This will be updated during fine-tuning.")
     token_price: PositiveFloat = Field(3, description="The expected price in USD per million tokens during fine-tuning for the current model.")
+    job_id: str = Field("", description="The fine-tuning job ID that will be polled on next run.")
 
+    # Fine-Tuning & Generating
     base_model: ChatModel = Field("gpt-4o-mini-2024-07-18", description="The name of the model that will be fine-tuned by the generated training data.")
-
-
+    fine_tuned_model: str = Field("", description="The name of the OpenAI model that was fine-tuned with your posts.")
+
+    # Generating
+    upload_blog_identifier: str = Field("", description="The identifier of the blog which generated drafts will be uploaded to. This must be a blog associated with the same account as the configured Tumblr secret tokens.")
+    draft_count: PositiveInt = Field(150, description="The number of drafts to process. This will affect the number of tokens used with OpenAI")
+    tags_chance: NonNegativeFloat = Field(0.1, description="The chance to generate tags for any given post. This will incur extra calls to OpenAI.")
 
     @override
     @classmethod
@@ -58,11 +63,11 @@ class Config(BaseSettings):
     def write_to_file(self) -> Self:
         if not self.download_blog_identifiers:
             rich.print("Enter the [cyan]identifiers of your blogs[/] that data should be [bold purple]downloaded[/] from, separated by commas.")
-            self.download_blog_identifiers = list(map(str.strip, Prompt.ask("[bold]Example
+            self.download_blog_identifiers = list(map(str.strip, Prompt.ask("[bold][Example] [dim]staff.tumblr.com,changes").split(",")))
 
         if not self.upload_blog_identifier:
             rich.print("Enter the [cyan]identifier of your blog[/] that drafts should be [bold purple]uploaded[/] to.")
-            self.upload_blog_identifier = Prompt.ask("[bold]
+            self.upload_blog_identifier = Prompt.ask("[bold][Example] [dim]staff.tumblr.com or changes").strip()
 
         toml_files = self.model_config.get("toml_file")
         if isinstance(toml_files, (Path, str)):
@@ -86,6 +91,6 @@ class Config(BaseSettings):
             toml_table[name] = value.get_secret_value() if isinstance(value, Secret) else dumped_model[name]
 
         Path(toml_file).write_text(
-            tomlkit.dumps(toml_table),
+            tomlkit.dumps(toml_table),
             encoding="utf_8",
         )
```
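The `Field` defaults above are what `write_to_file` serializes into `config.toml` on first run. Collected in one place for reference (values taken verbatim from the declarations in the diff):

```python
# Defaults as they appear in the Field(...) declarations above; config.toml
# carries the same keys.
defaults = {
    "download_blog_identifiers": [],
    "data_directory": "data",
    "custom_prompts_file": "custom_prompts.json",
    "examples_file": "examples.jsonl",
    "developer_message": "You are a Tumblr post bot. Please generate a Tumblr post in accordance with the user's request.",
    "user_message": "Please write a comical Tumblr post.",
    "expected_epochs": 3,
    "token_price": 3,
    "job_id": "",
    "base_model": "gpt-4o-mini-2024-07-18",
    "fine_tuned_model": "",
    "upload_blog_identifier": "",
    "draft_count": 150,
    "tags_chance": 0.1,
}
```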
```diff
--- tumblrbot-1.3.2/src/tumblrbot/utils/models.py
+++ tumblrbot-1.4.0/src/tumblrbot/utils/models.py
@@ -57,8 +57,8 @@ class Tokens(FullyValidatedModel):
         self.tumblr_client_key, self.tumblr_client_secret = self.online_token_prompt("https://tumblr.com/oauth/apps", "consumer key", "consumer secret")
 
         oauth_session = OAuth1Session(*self.get_tumblr_tokens()[:2])
-        fetch_response = oauth_session.fetch_request_token("http://tumblr.com/oauth/request_token")
-        full_authorize_url = oauth_session.authorization_url("http://tumblr.com/oauth/authorize")
+        fetch_response = oauth_session.fetch_request_token("http://tumblr.com/oauth/request_token")
+        full_authorize_url = oauth_session.authorization_url("http://tumblr.com/oauth/authorize")
         (redirect_response,) = self.online_token_prompt(full_authorize_url, "full redirect URL")
         oauth_response = oauth_session.parse_authorization_response(redirect_response.get_secret_value())
         oauth_session = OAuth1Session(
@@ -67,7 +67,7 @@
             fetch_response["oauth_token_secret"],
             verifier=oauth_response["oauth_verifier"],
         )
-        oauth_tokens = oauth_session.fetch_access_token("http://tumblr.com/oauth/access_token")
+        oauth_tokens = oauth_session.fetch_access_token("http://tumblr.com/oauth/access_token")
         self.tumblr_resource_owner_key = oauth_tokens["oauth_token"]
         self.tumblr_resource_owner_secret = oauth_tokens["oauth_token_secret"]
 
```
```diff
--- /dev/null
+++ tumblrbot-1.4.0/src/tumblrbot/utils/tumblr.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+from typing import Self
+
+from niquests import HTTPError, PreparedRequest, Response, Session
+from requests_cache import CacheMixin
+from requests_oauthlib import OAuth1
+
+from tumblrbot.utils.models import Post, Tokens
+
+
+@dataclass
+class TumblrClient(Session, CacheMixin):  # pyright: ignore[reportIncompatibleMethodOverride, reportIncompatibleVariableOverride]
+    tokens: Tokens
+
+    def __post_init__(self) -> None:
+        super().__init__(happy_eyeballs=True)
+        CacheMixin.__init__(self, use_cache_dir=True)
+
+        self.auth = OAuth1(*self.tokens.get_tumblr_tokens())
+        self.hooks["response"].append(self.response_hook)
+
+    def __enter__(self) -> Self:
+        super().__enter__()
+        return self
+
+    def response_hook(self, response: PreparedRequest | Response) -> None:
+        if isinstance(response, Response):
+            try:
+                response.raise_for_status()
+            except HTTPError as error:
+                if response.text:
+                    error.add_note(response.text)
+                raise
+
+    def retrieve_published_posts(self, blog_identifier: str, after: int) -> Response:
+        return self.get(
+            f"https://api.tumblr.com/v2/blog/{blog_identifier}/posts",
+            params={
+                "after": str(after),
+                "sort": "asc",
+                "npf": str(True),
+            },
+        )
+
+    def create_post(self, blog_identifier: str, post: Post) -> Response:
+        return self.post(
+            f"https://api.tumblr.com/v2/blog/{blog_identifier}/posts",
+            json=post.model_dump(mode="json"),
+        )
```
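This new file explains the dependency swap in `pyproject.toml`: `niquests` replaces `requests` as the `Session` base (with `happy_eyeballs` and the HTTP/3 extra), `requests-cache` contributes `CacheMixin` for on-disk caching, and OAuth1 signing stays with `requests-oauthlib`. A hypothetical usage sketch mirroring `__main__.py`:

```python
# Hypothetical usage; Tokens() loads the stored credentials as in __main__.py.
from tumblrbot.utils.models import Tokens
from tumblrbot.utils.tumblr import TumblrClient

with TumblrClient(tokens=Tokens()) as tumblr:
    response = tumblr.retrieve_published_posts("changes", after=0)
    print(response.json()["response"]["blog"]["posts"])  # total published posts
```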
```diff
--- tumblrbot-1.3.2/src/tumblrbot/utils/tumblr.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from dataclasses import dataclass
-
-from requests import HTTPError, Response
-from requests_oauthlib import OAuth1Session
-
-from tumblrbot.utils.models import Post, Tokens
-
-
-@dataclass
-class TumblrClient(OAuth1Session):
-    tokens: Tokens
-
-    def __post_init__(self) -> None:
-        super().__init__(*self.tokens.get_tumblr_tokens())  # pyright: ignore[reportUnknownMemberType]
-
-        self.hooks["response"].append(self.response_hook)
-
-    def response_hook(self, response: Response, **_: object) -> None:
-        try:
-            response.raise_for_status()
-        except HTTPError as error:
-            error.add_note(response.text)
-            raise
-
-    def retrieve_published_posts(self, blog_identifier: str, after: int) -> Response:
-        return self.get(
-            f"https://api.tumblr.com/v2/blog/{blog_identifier}/posts",
-            params={
-                "after": after,
-                "sort": "asc",
-                "npf": True,
-            },
-        )
-
-    def create_post(self, blog_identifier: str, post: Post) -> Response:
-        return self.post(
-            f"https://api.tumblr.com/v2/blog/{blog_identifier}/posts",
-            json=post.model_dump(mode="json"),
-        )
```