tumblrbot 1.9.7.tar.gz → 1.10.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/PKG-INFO +27 -40
  2. tumblrbot-1.10.1/README.md +179 -0
  3. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/pyproject.toml +27 -30
  4. tumblrbot-1.10.1/setup.cfg +4 -0
  5. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/__main__.py +47 -43
  6. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/flow/download.py +55 -55
  7. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/flow/examples.py +94 -97
  8. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/flow/fine_tune.py +141 -137
  9. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/flow/generate.py +97 -97
  10. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/utils/common.py +62 -57
  11. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/utils/models.py +225 -225
  12. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/utils/tumblr.py +83 -68
  13. tumblrbot-1.9.7/README.md → tumblrbot-1.10.1/src/tumblrbot.egg-info/PKG-INFO +197 -190
  14. tumblrbot-1.10.1/src/tumblrbot.egg-info/SOURCES.txt +19 -0
  15. tumblrbot-1.10.1/src/tumblrbot.egg-info/dependency_links.txt +1 -0
  16. tumblrbot-1.10.1/src/tumblrbot.egg-info/entry_points.txt +2 -0
  17. tumblrbot-1.10.1/src/tumblrbot.egg-info/requires.txt +9 -0
  18. tumblrbot-1.10.1/src/tumblrbot.egg-info/top_level.txt +1 -0
  19. tumblrbot-1.9.7/.github/FUNDING.yml +0 -15
  20. tumblrbot-1.9.7/.github/dependabot.yml +0 -11
  21. tumblrbot-1.9.7/.gitignore +0 -221
  22. tumblrbot-1.9.7/UNLICENSE +0 -24
  23. tumblrbot-1.9.7/build.ps1 +0 -8
  24. tumblrbot-1.9.7/sample_custom_prompts.jsonl +0 -1554
  25. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/__init__.py +0 -0
  26. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/flow/__init__.py +0 -0
  27. {tumblrbot-1.9.7 → tumblrbot-1.10.1}/src/tumblrbot/utils/__init__.py +0 -0
@@ -1,9 +1,12 @@
  Metadata-Version: 2.4
  Name: tumblrbot
- Version: 1.9.7
+ Version: 1.10.1
  Summary: An updated bot that posts to Tumblr, based on your very own blog!
- Requires-Python: >= 3.14
+ Project-URL: Funding, https://ko-fi.com/maidscientistizutsumimarin
+ Project-URL: Source, https://github.com/MaidScientistIzutsumiMarin/tumblrbot
+ Requires-Python: >=3.14
  Description-Content-Type: text/markdown
+ Requires-Dist: CurrencyConverter
  Requires-Dist: openai
  Requires-Dist: pydantic
  Requires-Dist: requests
@@ -12,15 +15,9 @@ Requires-Dist: rich
  Requires-Dist: tenacity
  Requires-Dist: tiktoken
  Requires-Dist: tomlkit
- Requires-Dist: pyinstaller ; extra == "dev"
- Project-URL: Funding, https://ko-fi.com/maidscientistizutsumimarin
- Project-URL: Source, https://github.com/MaidScientistIzutsumiMarin/tumblrbot
- Provides-Extra: dev

  # tumblrbot

- [tumblrbot.exe]: https://github.com/MaidScientistIzutsumiMarin/tumblrbot/releases/latest/download/tumblrbot.exe
-
  [OAuth]: https://oauth.net/1
  [Python]: https://python.org/download

@@ -30,11 +27,11 @@ Provides-Extra: dev
  [pip]: https://pypi.org
  [Rich]: https://pypi.org/project/rich

- [OpenAI]: https://pypi.org/project/openai
+ [OpenAI]: https://openai.com
  [OpenAI Pricing]: https://platform.openai.com/docs/pricing#fine-tuning
  [OpenAI Tokens]: https://platform.openai.com/settings/organization/api-keys
  [OpenAI Moderation API]: https://platform.openai.com/docs/guides/moderation
- [Flags]: https://platform.openai.com/docs/guides/moderation/over#content-classifications
+ [Flags]: https://platform.openai.com/docs/guides/moderation#content-classifications
  [Fine-Tuning Portal]: https://platform.openai.com/finetune

  [Tumblr]: https://tumblr.com
@@ -53,6 +50,19 @@ Provides-Extra: dev
  [Fine-Tuning]: #manual-fine-tuning
  [![PyPI - Version](https://img.shields.io/pypi/v/tumblrbot)](https://python.org/pypi/tumblrbot)

+ ## Installation & Usage
+
+ 1. Install the latest version of [Python]:
+ - Windows: `winget install python3`
+ - Linux (apt): `apt install python-pip`
+ - Linux (pacman): `pacman install python-pip`
+ 1. Install the [pip] package: `pip install tumblrbot`
+ - Alternatively, you can install from this repository: `pip install git+https://github.com/MaidScientistIzutsumiMarin/tumblrbot.git`
+ - On Linux, you will have to make a virtual environment or use the flag to install packages system-wide.
+ 1. Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config][configurable].
+
+ ---
+
  Description of original project:
  > 4tv-tumblrbot was a collaborative project I embarked on with my close friend Dima, who goes by @smoqueen on Tumblr. The aim of this endeavor was straightforward yet silly: to develop a Tumblr bot powered by a machine-learning model. This bot would be specifically trained on the content from a particular Tumblr blog or a selected set of blogs, allowing it to mimic the style, tone, and thematic essence of the original posts.

@@ -93,39 +103,17 @@ Features:
  - You can use regular expressions to filter out training data in the [config][configurable]. This is more of a brute-force solution, but it can work if the other solutions do not.
  - You can try limiting your dataset by specifying fewer blogs to download from or limiting the number of posts taken from each one in the [config][configurable].
  - If all else fails, you can manually remove data from the examples file until it passes. It is unfortunately not a definitive resource, but it can help to read about what the [OpenAI moderation API flags][Flags].
- - Sometimes, you will get an error about the training file not being found when starting fine-tuning. We do not currently have a fix or workaround for this. You should instead use the online portal for fine-tuning if this continues to happen. Read more in [fine-tuning]
- - *We are unsure if this is still happening.*
- - Post counts are incorrect when downloading posts. Our tests suggest this is a [Tumblr] API problem that is giving inaccurate numbers, so treat them as estimates.
+ - Post counts can be incorrect when downloading posts. Our tests suggest this is a [Tumblr] API problem that is giving inaccurate numbers, so treat them as estimates.

- **Please submit an issue or contact us for features you want added/reimplemented.**
-
- ## Installation & Usage
-
- ### Downloadable Binary
+ **To-Do:**

- | Pros | Cons |
- | --- | --- |
- | Easier to install | Harder to update |
- | No risk of dependencies breaking | Dependencies may be older |
+ - Add selection menus to make it easier to pick a particular operation.
+ - Add retrying to the fine-tuning process (if necessary).
+ - Add an option to filter newest posts by a specific date.

- 1. Download the latest release's [tumblrbot.exe].
- 1. Launch `tumblrbot.exe` in the install location.
-
- ### PyPi
-
- | Pros | Cons |
- | --- | --- |
- | Easier to update | Harder to install |
- | Dependencies may be newer | Dependencies may break |
+ **Please submit an issue or contact us for features you want added/reimplemented.**

- 1. Install the latest version of [Python]:
- - Windows: `winget install python3`
- - Linux (apt): `apt install python-pip`
- - Linux (pacman): `pacman install python-pip`
- 1. Install the [pip] package: `pip install tumblrbot`
- - Alternatively, you can install from this repository: `pip install git+https://github.com/MaidScientistIzutsumiMarin/tumblrbot.git`
- - On Linux, you will have to make a virtual environment or use the flag to install packages system-wide.
- 1. Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config][configurable].
+ ---

  ## Obtaining Tokens

@@ -207,4 +195,3 @@ You can manually upload the examples file to [OpenAI] and start the fine-tuning
  1. Press `Create`.
  1. (Optional) Copy the value next to `Job ID` and paste it into the [config][configurable] under `job_id`. You can then run the program and monitor its progress as usual.
  1. If you do not do the above, you will have to copy the value next to `Output model` once the job is complete and paste it into the [config][configurable] under `fine_tuned_model`.
-
@@ -0,0 +1,179 @@
+ # tumblrbot
+
+ [OAuth]: https://oauth.net/1
+ [Python]: https://python.org/download
+
+ [JSON Lines]: https://jsonlines.org
+ [JSON Lines Validator]: https://jsonlines.org/validator
+
+ [pip]: https://pypi.org
+ [Rich]: https://pypi.org/project/rich
+
+ [OpenAI]: https://openai.com
+ [OpenAI Pricing]: https://platform.openai.com/docs/pricing#fine-tuning
+ [OpenAI Tokens]: https://platform.openai.com/settings/organization/api-keys
+ [OpenAI Moderation API]: https://platform.openai.com/docs/guides/moderation
+ [Flags]: https://platform.openai.com/docs/guides/moderation#content-classifications
+ [Fine-Tuning Portal]: https://platform.openai.com/finetune
+
+ [Tumblr]: https://tumblr.com
+ [Tumblr Tokens]: https://tumblr.com/oauth/apps
+ [Tumblr API Documentation on Blog Identifiers]: https://tumblr.com/docs/en/api/v2#blog-identifiers
+
+ [Format String]: https://docs.python.org/3/library/string.html#format-string-syntax
+
+ [Download]: src/tumblrbot/flow/download.py
+ [Examples]: src/tumblrbot/flow/examples.py
+ [Fine-Tune]: src/tumblrbot/flow/fine_tune.py
+ [Generate]: src/tumblrbot/flow/generate.py
+ [Main]: src/tumblrbot/__main__.py
+
+ [Configurable]: #configuration
+ [Fine-Tuning]: #manual-fine-tuning
+ [![PyPI - Version](https://img.shields.io/pypi/v/tumblrbot)](https://python.org/pypi/tumblrbot)
+
+ ## Installation & Usage
+
+ 1. Install the latest version of [Python]:
+ - Windows: `winget install python3`
+ - Linux (apt): `apt install python-pip`
+ - Linux (pacman): `pacman install python-pip`
+ 1. Install the [pip] package: `pip install tumblrbot`
+ - Alternatively, you can install from this repository: `pip install git+https://github.com/MaidScientistIzutsumiMarin/tumblrbot.git`
+ - On Linux, you will have to make a virtual environment or use the flag to install packages system-wide.
+ 1. Run `tumblrbot` from anywhere. Run `tumblrbot --help` for command-line options. Every command-line option corresponds to a value from the [config][configurable].
+
+ ---
+
+ Description of original project:
+ > 4tv-tumblrbot was a collaborative project I embarked on with my close friend Dima, who goes by @smoqueen on Tumblr. The aim of this endeavor was straightforward yet silly: to develop a Tumblr bot powered by a machine-learning model. This bot would be specifically trained on the content from a particular Tumblr blog or a selected set of blogs, allowing it to mimic the style, tone, and thematic essence of the original posts.
+
+ This fork is largely a rewrite of the source code with similarities in its structure and process.
+
+ Features:
+
+ - An [interactive console][Main] for all steps of generating posts for the blog:
+ 1. Asks for [OpenAI] and [Tumblr] tokens.
+ 1. Retrieves [Tumblr] [OAuth] tokens.
+ 1. [Downloads posts][Download] from specified blogs ([configurable]).
+ - Skips redownloading already downloaded posts.
+ - Shows progress and previews the current post.
+ 1. [Creates examples][Examples] to fine-tune the model from the downloaded posts.
+ - Filters out posts that contain more than just text data.
+ - Filters out posts that contain regular expressions ([configurable]).
+ - Only uses the most recent posts from each blog ([configurable]).
+ - Adds custom user messages and assistant responses to the dataset ([configurable]).
+ 1. Filters out any posts flagged by the [OpenAI Moderation API].
+ 1. [Uploads examples][Fine-Tune] to [OpenAI] and begins the fine-tuning process.
+ - Provides cost estimates if the currently saved examples are used to fine-tune a base model ([configurable]).
+ - Resumes monitoring the same fine-tuning process when restarted.
+ - Deletes the uploaded examples file if fine-tuning does not succeed (optional).
+ - Stores the output model automatically when fine-tuning is completed.
+ 1. [Generates and uploads posts][Generate] to a blog using the fine-tuned model ([configurable]).
+ - Creates tags by extracting keywords using the base model ([configurable]).
+ - Uploads posts as drafts.
+ - Reblogs posts from allowed blogs ([configurable]).
+ - Shows progress and previews the current post.
+ - Colorful output, progress bars, and post previews using [rich].
+ - Automatically keeps the [config][configurable] file up-to-date and recreates it if missing (without overriding user settings).
+
+ **Known Issues:**
+
+ - Fine-tuning can fail after the validation phase due to the examples file not passing [OpenAI] moderation checks. There are a few workarounds for this that can be tried in combination:
+ - You can retry with the same examples file. This has, on rare occasions, worked.
+ - You can submit the examples file to the [OpenAI] moderation API with this program's guided prompts. This has worked consistently for our dataset, but others have reported it not being thorough enough.
+ - You can use regular expressions to filter out training data in the [config][configurable]. This is more of a brute-force solution, but it can work if the other solutions do not.
+ - You can try limiting your dataset by specifying fewer blogs to download from or limiting the number of posts taken from each one in the [config][configurable].
+ - If all else fails, you can manually remove data from the examples file until it passes. It is unfortunately not a definitive resource, but it can help to read about what the [OpenAI moderation API flags][Flags].
+ - Post counts can be incorrect when downloading posts. Our tests suggest this is a [Tumblr] API problem that is giving inaccurate numbers, so treat them as estimates.
+
+ **To-Do:**
+
+ - Add selection menus to make it easier to pick a particular operation.
+ - Add retrying to the fine-tuning process (if necessary).
+ - Add an option to filter newest posts by a specific date.
+
+ **Please submit an issue or contact us for features you want added/reimplemented.**
+
+ ---
+
+ ## Obtaining Tokens
+
+ ### OpenAI
+
+ API token can be created here: [OpenAI Tokens].
+
+ 1. Leave everything at the defaults and set `Project` to `Default Project`.
+ 1. Press `Create secret key`.
+ 1. Press `Copy` to copy the API token to your clipboard.
+
+ ### Tumblr
+
+ API tokens can be created here: [Tumblr Tokens].
+
+ 1. Press `+ Register Application`.
+ 1. Enter anything for `Application Name` and `Application Description`.
+ 1. Enter any URL for `Application Website` and `Default callback URL`, like `https://example.com`.
+ 1. Enter any email address for `Administrative contact email`. It probably doesn't need to be one you have access to.
+ 1. Press the checkbox next to `I'm not a robot` and complete the CAPTCHA.
+ 1. Press `Register`.
+ 1. You now have access to your `consumer key` next to `Oauth Consumer Key`.
+ 1. Press `Show secret key` to see your `Consumer Secret`.
+
+ When running this program, you will be prompted to enter all of these tokens. If something goes wrong while entering the tokens, you can always reset them by running the program again and answering `y` to the relevant prompt.
+
+ After inputting the [Tumblr] tokens, you will be given a URL that you need to open in your browser. Press `Allow`, then copy and paste the URL of the page you are redirected to into the console.
+
+ ## Configuration
+
+ All config options can be found in `config.toml` after running the program once. This will be kept up-to-date if there are changes to the config's format in a future update. This also means it may be worthwhile to double-check the config file after an update. Any changes to the config should be in the changelog for a given version.
+
+ All file options can include directories that will be created when the program is run.
+
+ All config options that involve *blog identifiers* expect any version of a blog URL, which is explained in more detail in the [Tumblr API documentation on blog identifiers].
+
+ A valid post:
+
+ - Contains any content.
+ - Only has text.
+ - Is not an ask.
+ - Is not a reblog.
+
+ Specific Options:
+
+ - `custom_prompts_file` This file should follow the following file format:
+
+ ```json
+ {"user message 1": "assistant response 1"}
+ {"user message 1": "assistant response 1"}
+ {"user message 2": "assistant response 2", "user message 3": "assistant response 3"}
+ ```
+
+ To be specific, it should follow the [JSON Lines] file format with one collection of name/value pairs (a dictionary) per line. You can validate your file using the [JSON Lines Validator].
+
+ - **`post_limit`** - At most, this many valid posts will be included in the training data. This effectively is a filter to select the `N` most recent valid posts from each blog. `0` will use every available valid post.
+ - **`moderation_batch_size`** - This controls the batch size when submitting posts to the OpenAI moderation. There is no limit, but higher numbers will cause you to be rate-limited more, which can overall be slower. Low numbers reduce rate-limiting, but can sometimes take longer due to needing more requests. The best value will depend on your computer, internet connection, and any number of factors on OpenAI's side. The default value is just what worked best for our computer.
+ - **`filtered_words`** - During training data generation, any posts with the specified words will be removed. Word boundaries are not checked by default, so “the” will also filter out posts with “them” or “thematic”. This setting supports regular expressions, so you can explicitly look for word boundaries by surrounding an entry with “\\\b”, i.e., “\\\bthe\\\b”. Regular expressions have to be escaped like so due to how JSON data is read in. If you are familiar with regular expressions, it could be useful for you to know that every entry is joined with a “|” which is then used to search the post content for any matches.
+ - **`developer_message`** - This message is used in for fine-tuning the AI as well as generating prompts. If you change this, you will need to run the fine-tuning again with the new value before generating posts.
+ - **`user_message`** - This setting is used and works in the same way as `developer_message`.
+ - **`expected_epochs`** - The default value here is the default number of epochs for `base_model`. You may have to change this value if you change `base_model`. After running fine-tuning once, you will see the number of epochs used in the [fine-tuning portal] under *Hyperparameters*. This value will also be updated automatically if you run fine-tuning through this program.
+ - **`token_price`** - The default value here is the default token price for `base_model`. You can find the up-to-date value in [OpenAI Pricing], in the *Training* column.
+ - **`job_id`** - If there is any value here, this program will resume monitoring the corresponding job, instead of starting a new one. This gets set when starting the fine-tuning and is cleared when it is completed. You can read more in [fine-tuning].
+ - **`base_model`** - This value is used to choose the tokenizer for estimating fine-tuning costs. It is also the base model that will be fine-tuned and the model that is used to generate tags. You can find a list of options in the [fine-tuning portal] by pressing `+ Create` and opening the drop-down list for `Base Model`. Be sure to update `token_price` if you change this value.
+ - **`fine_tuned_model`** - Set automatically after monitoring fine-tuning if the job has succeeded. You can read more in [fine-tuning].
+ - **`tags_chance`** - This should be between 0 and 1. Setting it to 0 corresponds to a 0% chance (never) to add tags to a post. 1 corresponds to a 100% chance (always) to add tags to a post. Adding tags incurs a very small token cost.
+ - **`reblog_blog_identifiers`** - Whenever a reblog is attempted, a random blog from this list will be chosen to be reblogged from.
+ - **`reblog_chance`** - This setting works the same way as `tags_chance`.
+ - **`reblog_user_message`** - This setting is a [format string]. The only argument it is formatted with is the content of the post being reblogged. In simple terms, the `{}` will be replaced with said content. Alternatively, you can leave out the `{}` so that the reblogged post is appended to the end.
+ - *Note: The bot is only given the latest message in a reblog chain due to the required complexity and added costs of including the entire chain.*
+
+ ## Manual Fine-Tuning
+
+ You can manually upload the examples file to [OpenAI] and start the fine-tuning here: [fine-tuning portal].
+
+ 1. Press `+ Create`.
+ 1. Select the desired `Base Model` from the dropdown. This should ideally match the model set in the [config][configurable].
+ 1. Upload the generated examples file to the section under `Training data`. You can find the path for this in the [config][configurable].
+ 1. Press `Create`.
+ 1. (Optional) Copy the value next to `Job ID` and paste it into the [config][configurable] under `job_id`. You can then run the program and monitor its progress as usual.
+ 1. If you do not do the above, you will have to copy the value next to `Output model` once the job is complete and paste it into the [config][configurable] under `fine_tuned_model`.
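The new README above describes `filtered_words` as a list of regular expressions that are joined with `|` and searched against each post's content, with word boundaries only enforced when an entry spells out `\b` itself. A minimal sketch of that described behaviour, using made-up entries and posts (not values from the package):

```python
import re

# Hypothetical filtered_words entries; r"\bthe\b" only matches the whole word "the".
filtered_words = [r"\bthe\b", "spoilers"]
pattern = re.compile("|".join(filtered_words))  # entries joined with "|", as the README describes

posts = ["read the post", "no spoilers here", "a thematic post", "a clean post"]
kept = [post for post in posts if not pattern.search(post)]
print(kept)  # ['a thematic post', 'a clean post'] -- "thematic" survives the word-boundary entry
```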
@@ -1,30 +1,27 @@
- [project]
- name = "tumblrbot"
- version = "1.9.7"
- description = "An updated bot that posts to Tumblr, based on your very own blog!"
- readme = "README.md"
- requires-python = ">= 3.14"
- dependencies = [
- "openai",
- "pydantic",
- "requests",
- "requests-oauthlib",
- "rich",
- "tenacity",
- "tiktoken",
- "tomlkit"
- ]
-
- [project.urls]
- Funding = "https://ko-fi.com/maidscientistizutsumimarin"
- Source = "https://github.com/MaidScientistIzutsumiMarin/tumblrbot"
-
- [project.scripts]
- tumblrbot = "tumblrbot.__main__:main"
-
- [project.optional-dependencies]
- dev = ["pyinstaller"]
-
- [build-system]
- requires = ["flit_core"]
- build-backend = "flit_core.buildapi"
+ [project]
+ name = "tumblrbot"
+ version = "1.10.1"
+ description = "An updated bot that posts to Tumblr, based on your very own blog!"
+ readme = "README.md"
+ requires-python = ">= 3.14"
+ dependencies = [
+ "CurrencyConverter",
+ "openai",
+ "pydantic",
+ "requests",
+ "requests-oauthlib",
+ "rich",
+ "tenacity",
+ "tiktoken",
+ "tomlkit"
+ ]
+
+ [project.urls]
+ Funding = "https://ko-fi.com/maidscientistizutsumimarin"
+ Source = "https://github.com/MaidScientistIzutsumiMarin/tumblrbot"
+
+ [project.scripts]
+ tumblrbot = "tumblrbot.__main__:main"
+
+ [tool.uv]
+ package = true
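The only new runtime dependency in 1.10.1 is `CurrencyConverter`; the `pyinstaller` dev extra and the flit `[build-system]` table are removed and a `[tool.uv]` table is added. Together with the new `setlocale(LC_ALL, "")` call in `__main__.py` (next file), this suggests the USD fine-tuning cost estimate is now converted for display, although the `fine_tune.py` diff is not expanded here. A minimal sketch of the CurrencyConverter API under that assumption (the amount and target currency are made up):

```python
from currency_converter import CurrencyConverter

# Hypothetical USD cost estimate; the real value would come from the bot's estimate step.
usd_estimate = 12.34
converter = CurrencyConverter()  # ships with ECB reference rates
eur_estimate = converter.convert(usd_estimate, "USD", "EUR")
print(f"~{usd_estimate:.2f} USD ≈ {eur_estimate:.2f} EUR")
```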
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -1,43 +1,47 @@
- from sys import exit as sys_exit
-
- from openai import OpenAI
- from rich.prompt import Confirm
- from rich.traceback import install
-
- from tumblrbot.flow.download import PostDownloader
- from tumblrbot.flow.examples import ExamplesWriter
- from tumblrbot.flow.fine_tune import FineTuner
- from tumblrbot.flow.generate import DraftGenerator
- from tumblrbot.utils.common import FlowClass
- from tumblrbot.utils.models import Tokens
- from tumblrbot.utils.tumblr import TumblrSession
-
-
- def main() -> None:
-     install()
-
-     tokens = Tokens.load()
-     with OpenAI(api_key=tokens.openai_api_key) as openai, TumblrSession(tokens) as tumblr:
-         if Confirm.ask("Download latest posts?", default=False):
-             PostDownloader(openai, tumblr).main()
-
-         examples_writer = ExamplesWriter(openai, tumblr)
-         if Confirm.ask("Create training data?", default=False):
-             examples_writer.main()
-
-         if Confirm.ask("Remove training data flagged by the OpenAI moderation? [bold]This can sometimes resolve errors with fine-tuning validation, but is slow.", default=False):
-             examples_writer.filter_examples()
-
-         fine_tuner = FineTuner(openai, tumblr)
-         fine_tuner.print_estimates()
-
-         message = "Resume monitoring the previous fine-tuning process?" if FlowClass.config.job_id else "Upload data to OpenAI for fine-tuning?"
-         if Confirm.ask(f"{message} [bold]You must do this to set the model to generate drafts from. Alternatively, manually enter a model into the config", default=False):
-             fine_tuner.main()
-
-         if Confirm.ask("Generate drafts?", default=False):
-             DraftGenerator(openai, tumblr).main()
-
-
- if __name__ == "__main__":
-     sys_exit(main())
+ from locale import LC_ALL, setlocale
+ from sys import exit as sys_exit
+ from sys import maxsize
+
+ from openai import OpenAI
+ from rich.prompt import Confirm
+ from rich.traceback import install
+
+ from tumblrbot.flow.download import PostDownloader
+ from tumblrbot.flow.examples import ExamplesWriter
+ from tumblrbot.flow.fine_tune import FineTuner
+ from tumblrbot.flow.generate import DraftGenerator
+ from tumblrbot.utils.common import FlowClass
+ from tumblrbot.utils.models import Tokens
+ from tumblrbot.utils.tumblr import TumblrSession
+
+
+ def main() -> None:
+     setlocale(LC_ALL, "")
+
+     install()
+
+     tokens = Tokens.load()
+     with OpenAI(api_key=tokens.openai_api_key, max_retries=maxsize) as openai, TumblrSession(tokens) as tumblr:
+         if Confirm.ask("Download latest posts?", default=False):
+             PostDownloader(openai, tumblr).main()
+
+         examples_writer = ExamplesWriter(openai, tumblr)
+         if Confirm.ask("Create training data?", default=False):
+             examples_writer.main()
+
+         if Confirm.ask("Remove training data flagged by the OpenAI moderation? [bold]This can sometimes resolve errors with fine-tuning validation, but is slow.", default=False):
+             examples_writer.filter_examples()
+
+         fine_tuner = FineTuner(openai, tumblr)
+         fine_tuner.print_estimates()
+
+         message = "Resume monitoring the previous fine-tuning process?" if FlowClass.config.job_id else "Upload data to OpenAI for fine-tuning?"
+         if Confirm.ask(f"{message} [bold]You must do this to set the model to generate drafts from. Alternatively, manually enter a model into the config", default=False):
+             fine_tuner.main()
+
+         if Confirm.ask("Generate drafts?", default=False):
+             DraftGenerator(openai, tumblr).main()
+
+
+ if __name__ == "__main__":
+     sys_exit(main())
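The new `main()` differs in two ways: `setlocale(LC_ALL, "")` switches from Python's default C locale to the user's locale, and `max_retries=maxsize` makes the OpenAI client retry retryable requests essentially without limit instead of the library default of 2. An illustrative snippet of what the locale call enables; how tumblrbot actually uses the locale is not shown in this diff:

```python
from locale import LC_ALL, format_string, setlocale

setlocale(LC_ALL, "")  # adopt the user's locale, as the new main() does
# Grouped number formatting now follows the locale, e.g. '1,234,567' under en_US.
print(format_string("%d", 1234567, grouping=True))
```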
@@ -1,55 +1,55 @@
- from json import dump
- from typing import TYPE_CHECKING, override
-
- from tumblrbot.utils.common import FlowClass, PreviewLive
- from tumblrbot.utils.models import Post
-
- if TYPE_CHECKING:
-     from io import TextIOBase
-
-
- class PostDownloader(FlowClass):
-     @override
-     def main(self) -> None:
-         self.config.data_directory.mkdir(parents=True, exist_ok=True)
-
-         with PreviewLive() as live:
-             for blog_identifier in self.config.download_blog_identifiers:
-                 data_path = self.get_data_path(blog_identifier)
-
-                 completed = 0
-                 after = 0
-                 if data_path.exists():
-                     lines = data_path.read_bytes().splitlines() if data_path.exists() else []
-                     completed = len(lines)
-                     if lines:
-                         after = Post.model_validate_json(lines[-1]).timestamp
-
-                 with data_path.open("a", encoding="utf_8") as fp:
-                     self.paginate_posts(
-                         blog_identifier,
-                         completed,
-                         after,
-                         fp,
-                         live,
-                     )
-
-     def paginate_posts(self, blog_identifier: str, completed: int, after: int, fp: TextIOBase, live: PreviewLive) -> None:
-         task_id = live.progress.add_task(f"Downloading posts from '{blog_identifier}'...", total=None, completed=completed)
-
-         while True:
-             response = self.tumblr.retrieve_published_posts(blog_identifier, after=after)
-             live.progress.update(task_id, total=response.response.blog.posts, completed=completed)
-
-             if not response.response.posts:
-                 return
-
-             for post in response.response.posts:
-                 dump(post, fp)
-                 fp.write("\n")
-
-                 model = Post.model_validate(post)
-                 after = model.timestamp
-                 live.custom_update(model)
-
-             completed += len(response.response.posts)
+ from json import dump
+ from typing import TYPE_CHECKING, override
+
+ from tumblrbot.utils.common import FlowClass, PreviewLive
+ from tumblrbot.utils.models import Post
+
+ if TYPE_CHECKING:
+     from io import TextIOBase
+
+
+ class PostDownloader(FlowClass):
+     @override
+     def main(self) -> None:
+         self.config.data_directory.mkdir(parents=True, exist_ok=True)
+
+         with PreviewLive() as live:
+             for blog_identifier in self.config.download_blog_identifiers:
+                 data_path = self.get_data_path(blog_identifier)
+
+                 completed = 0
+                 after = 0
+                 if data_path.exists():
+                     lines = data_path.read_bytes().splitlines() if data_path.exists() else []
+                     completed = len(lines)
+                     if lines:
+                         after = Post.model_validate_json(lines[-1]).timestamp
+
+                 with data_path.open("a", encoding="utf_8") as fp:
+                     self.paginate_posts(
+                         blog_identifier,
+                         completed,
+                         after,
+                         fp,
+                         live,
+                     )
+
+     def paginate_posts(self, blog_identifier: str, completed: int, after: int, fp: TextIOBase, live: PreviewLive) -> None:
+         task_id = live.progress.add_task(f"Downloading posts from '{blog_identifier}'...", total=None, completed=completed)
+
+         while True:
+             response = self.tumblr.retrieve_published_posts(blog_identifier, after=after)
+             live.progress.update(task_id, total=response.response.blog.posts, completed=completed)
+
+             if not response.response.posts:
+                 return
+
+             for post in response.response.posts:
+                 dump(post, fp)
+                 fp.write("\n")
+
+                 model = Post.model_validate(post)
+                 after = model.timestamp
+                 live.custom_update(model)
+
+             completed += len(response.response.posts)
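The body of `download.py` is identical on both sides of this diff; the +55/−55 most likely reflects a whitespace or line-ending change. The downloader appends each post as one JSON object per line and, on the next run, derives its resume point from the file it already wrote. A standalone sketch of that resume logic, where the file name is hypothetical and `json.loads` stands in for the `Post` pydantic model used above:

```python
from json import loads
from pathlib import Path

data_path = Path("data/example-blog.jsonl")  # hypothetical per-blog data file
lines = data_path.read_bytes().splitlines() if data_path.exists() else []

completed = len(lines)                                 # posts downloaded so far
after = loads(lines[-1])["timestamp"] if lines else 0  # timestamp of the newest saved post
print(f"resuming with {completed} posts, after={after}")
```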