mdfb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdfb-0.1.0/LICENSE +21 -0
- mdfb-0.1.0/PKG-INFO +121 -0
- mdfb-0.1.0/README.md +96 -0
- mdfb-0.1.0/mdfb/__init__.py +0 -0
- mdfb-0.1.0/mdfb/core/download_blobs.py +106 -0
- mdfb-0.1.0/mdfb/core/fetch_post_details.py +113 -0
- mdfb-0.1.0/mdfb/core/get_post_identifiers.py +61 -0
- mdfb-0.1.0/mdfb/core/resolve_handle.py +24 -0
- mdfb-0.1.0/mdfb/mdfb.py +100 -0
- mdfb-0.1.0/mdfb/utils/constants.py +7 -0
- mdfb-0.1.0/mdfb/utils/helpers.py +45 -0
- mdfb-0.1.0/mdfb/utils/logging.py +13 -0
- mdfb-0.1.0/mdfb/utils/validation.py +37 -0
- mdfb-0.1.0/pyproject.toml +45 -0
mdfb-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Ibrahim Haji Abdi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
mdfb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mdfb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Ibrahim Haji Abdi
|
|
6
|
+
Author-email: ibrahim.hajiabdi09@gmail.com
|
|
7
|
+
Requires-Python: >=3.8,<3.14
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Dist: argparse (>=1.4.0,<2.0.0)
|
|
16
|
+
Requires-Dist: atproto (>=0.0.55,<0.0.56)
|
|
17
|
+
Requires-Dist: pathvalidate (>=3.2.1,<4.0.0)
|
|
18
|
+
Requires-Dist: pytest (>=8.3.4,<9.0.0)
|
|
19
|
+
Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
|
|
20
|
+
Requires-Dist: requests-mock (>=1.12.1,<2.0.0)
|
|
21
|
+
Requires-Dist: tdqm (>=0.0.1,<0.0.2)
|
|
22
|
+
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
23
|
+
Requires-Dist: tqdm (>=4.67.1,<5.0.0)
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# mdfb-downloader-for-bluesky
|
|
27
|
+
|
|
28
|
+
mass-downloader-for-bluesky (mdfb) is a Python cli application that can download large amounts of posts from bluesky from any given account.
|
|
29
|
+
|
|
30
|
+
## Navigation
|
|
31
|
+
- [mdfb-downloader-for-bluesky](#mdfb-downloader-for-bluesky)
|
|
32
|
+
- [Navigation](#navigation)
|
|
33
|
+
- [Installation](#installation)
|
|
34
|
+
- [Manual](#manual)
|
|
35
|
+
- [Usage](#usage)
|
|
36
|
+
- [Examples](#examples)
|
|
37
|
+
- [Linux](#linux)
|
|
38
|
+
- [Windows](#windows)
|
|
39
|
+
- [Naming Convention](#naming-convention)
|
|
40
|
+
- [Download Amount](#download-amount)
|
|
41
|
+
- [Note](#note)
|
|
42
|
+
- [Options](#options)
|
|
43
|
+
- [Note](#note-1)
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
You will need [Python](https://www.python.org/downloads/) to be installed to use this CLI.
|
|
48
|
+
|
|
49
|
+
You can install via pip by:
|
|
50
|
+
```bash
|
|
51
|
+
pip install mdfb
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Manual
|
|
55
|
+
|
|
56
|
+
Have [Poetry](https://python-poetry.org/) installed.
|
|
57
|
+
|
|
58
|
+
Then clone the project, open a poetry shell and then install all dependencies.
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone git@github.com:IbrahimHajiAbdi/mdfb-downloader-for-bluesky.git
|
|
63
|
+
cd mdfb-downloader-for-bluesky
|
|
64
|
+
poetry shell
|
|
65
|
+
poetry install
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
``mdfb`` works by using the public API offered by bluesky to retrieve posts liked, reposted or posted by the desired account.
|
|
70
|
+
|
|
71
|
+
``mdfb`` will download the information for a post and the accompanying media, video or image(s). If there is no image(s) or video, it will just download the information of the post. The information of the post will be a JSON file and have lots of accompanying data, such as the text in the post, creation time of the post and author details. Currently, the retrieved posts start from the latest post to the oldest.
|
|
72
|
+
|
|
73
|
+
You will need to be inside a poetry shell to use ``mdfb`` if installed manually
|
|
74
|
+
|
|
75
|
+
### Examples
|
|
76
|
+
|
|
77
|
+
Some example commands would be:
|
|
78
|
+
|
|
79
|
+
#### Linux
|
|
80
|
+
```bash
|
|
81
|
+
mdfb --handle bsky.app -l 10 --like ./media/
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
#### Windows
|
|
85
|
+
```bash
|
|
86
|
+
mdfb --handle bsky.app -l 100 --like --repost --post ./media/
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Naming Convention
|
|
90
|
+
``mdfb``'s naming convention is: ``"{rkey}_{handle}_{text}"``, if it is downloading a post with multiple images then the naming will be: ``"{rkey}_{handle}_{text}_{i}"``, where "i" represents the order of the images in the post ranging from 1 - 4. In addition, the filenames are limited to 256 bytes and will be truncated down to that size.
|
|
91
|
+
|
|
92
|
+
### Download Amount
|
|
93
|
+
When specifying the limit, this will be true for all types of post downloaded. For example:
|
|
94
|
+
```bash
|
|
95
|
+
mdfb --handle bsky.app -l 100 --like --repost --post ./media/
|
|
96
|
+
```
|
|
97
|
+
This would download 100 likes, 100 reposts and 100 posts, totalling 300 posts downloaded.
|
|
98
|
+
|
|
99
|
+
### Note
|
|
100
|
+
The maximum number of threads is currently 3; this can be changed in the ``mdfb/utils/constants.py`` file. Furthermore, there are more constants that can be changed in that file, such as the delay between each request and the number of retries before marking that post as a failure and continuing.
|
|
101
|
+
|
|
102
|
+
## Options
|
|
103
|
+
- ``--handle``
|
|
104
|
+
- The handle of the target account.
|
|
105
|
+
- ``--did, -d``
|
|
106
|
+
- The DID of the target account.
|
|
107
|
+
- ``--limit, -l``
|
|
108
|
+
- The amount of posts that want to be downloaded. **Required**.
|
|
109
|
+
- ``directory``
|
|
110
|
+
- Positional argument, where all the downloaded files are to be located. **Required**.
|
|
111
|
+
- ``--threads``
|
|
112
|
+
- The amount of threads wanted to download posts more efficiently, maximum number of threads is 3.
|
|
113
|
+
- ``--like``
|
|
114
|
+
- To retrieve liked posts
|
|
115
|
+
- ``--repost``
|
|
116
|
+
- To retrieve reposts
|
|
117
|
+
- ``--post``
|
|
118
|
+
- To retrieve posts
|
|
119
|
+
### Note
|
|
120
|
+
At least one of the flags: ``--like``, ``--repost``, ``--post`` is **required**.
|
|
121
|
+
In addition, ``--did, -d`` and ``--handle`` are mutually exclusive, and at least one of them is **required** as well.
|
mdfb-0.1.0/README.md
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# mdfb-downloader-for-bluesky
|
|
2
|
+
|
|
3
|
+
mass-downloader-for-bluesky (mdfb) is a Python cli application that can download large amounts of posts from bluesky from any given account.
|
|
4
|
+
|
|
5
|
+
## Navigation
|
|
6
|
+
- [mdfb-downloader-for-bluesky](#mdfb-downloader-for-bluesky)
|
|
7
|
+
- [Navigation](#navigation)
|
|
8
|
+
- [Installation](#installation)
|
|
9
|
+
- [Manual](#manual)
|
|
10
|
+
- [Usage](#usage)
|
|
11
|
+
- [Examples](#examples)
|
|
12
|
+
- [Linux](#linux)
|
|
13
|
+
- [Windows](#windows)
|
|
14
|
+
- [Naming Convention](#naming-convention)
|
|
15
|
+
- [Download Amount](#download-amount)
|
|
16
|
+
- [Note](#note)
|
|
17
|
+
- [Options](#options)
|
|
18
|
+
- [Note](#note-1)
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
You will need [Python](https://www.python.org/downloads/) to be installed to use this CLI.
|
|
23
|
+
|
|
24
|
+
You can install via pip by:
|
|
25
|
+
```bash
|
|
26
|
+
pip install mdfb
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Manual
|
|
30
|
+
|
|
31
|
+
Have [Poetry](https://python-poetry.org/) installed.
|
|
32
|
+
|
|
33
|
+
Then clone the project, open a poetry shell and then install all dependencies.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
git clone git@github.com:IbrahimHajiAbdi/mdfb-downloader-for-bluesky.git
|
|
38
|
+
cd mdfb-downloader-for-bluesky
|
|
39
|
+
poetry shell
|
|
40
|
+
poetry install
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
``mdfb`` works by using the public API offered by bluesky to retrieve posts liked, reposted or posted by the desired account.
|
|
45
|
+
|
|
46
|
+
``mdfb`` will download the information for a post and the accompanying media, video or image(s). If there is no image(s) or video, it will just download the information of the post. The information of the post will be a JSON file and have lots of accompanying data, such as the text in the post, creation time of the post and author details. Currently, the retrieved posts start from the latest post to the oldest.
|
|
47
|
+
|
|
48
|
+
You will need to be inside a poetry shell to use ``mdfb`` if installed manually
|
|
49
|
+
|
|
50
|
+
### Examples
|
|
51
|
+
|
|
52
|
+
Some example commands would be:
|
|
53
|
+
|
|
54
|
+
#### Linux
|
|
55
|
+
```bash
|
|
56
|
+
mdfb --handle bsky.app -l 10 --like ./media/
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
#### Windows
|
|
60
|
+
```bash
|
|
61
|
+
mdfb --handle bsky.app -l 100 --like --repost --post ./media/
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Naming Convention
|
|
65
|
+
``mdfb``'s naming convention is: ``"{rkey}_{handle}_{text}"``, if it is downloading a post with multiple images then the naming will be: ``"{rkey}_{handle}_{text}_{i}"``, where "i" represents the order of the images in the post ranging from 1 - 4. In addition, the filenames are limited to 256 bytes and will be truncated down to that size.
|
|
66
|
+
|
|
67
|
+
### Download Amount
|
|
68
|
+
When specifying the limit, this will be true for all types of post downloaded. For example:
|
|
69
|
+
```bash
|
|
70
|
+
mdfb --handle bsky.app -l 100 --like --repost --post ./media/
|
|
71
|
+
```
|
|
72
|
+
This would download 100 likes, 100 reposts and 100 posts, totalling 300 posts downloaded.
|
|
73
|
+
|
|
74
|
+
### Note
|
|
75
|
+
The maximum number of threads is currently 3; this can be changed in the ``mdfb/utils/constants.py`` file. Furthermore, there are more constants that can be changed in that file, such as the delay between each request and the number of retries before marking that post as a failure and continuing.
|
|
76
|
+
|
|
77
|
+
## Options
|
|
78
|
+
- ``--handle``
|
|
79
|
+
- The handle of the target account.
|
|
80
|
+
- ``--did, -d``
|
|
81
|
+
- The DID of the target account.
|
|
82
|
+
- ``--limit, -l``
|
|
83
|
+
- The amount of posts that want to be downloaded. **Required**.
|
|
84
|
+
- ``directory``
|
|
85
|
+
- Positional argument, where all the downloaded files are to be located. **Required**.
|
|
86
|
+
- ``--threads``
|
|
87
|
+
- The amount of threads wanted to download posts more efficiently, maximum number of threads is 3.
|
|
88
|
+
- ``--like``
|
|
89
|
+
- To retrieve liked posts
|
|
90
|
+
- ``--repost``
|
|
91
|
+
- To retrieve reposts
|
|
92
|
+
- ``--post``
|
|
93
|
+
- To retrieve posts
|
|
94
|
+
### Note
|
|
95
|
+
At least one of the flags: ``--like``, ``--repost``, ``--post`` is **required**.
|
|
96
|
+
In addition, ``--did, -d`` and ``--handle`` are mutually exclusive, and at least one of them is **required** as well.
|
|
File without changes
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from atproto_client.namespaces.sync_ns import ComAtprotoSyncNamespace
|
|
4
|
+
from atproto_client.models.com.atproto.repo.list_records import ParamsDict
|
|
5
|
+
from atproto import Client
|
|
6
|
+
import re, time
|
|
7
|
+
from pathvalidate import sanitize_filename
|
|
8
|
+
import encodings
|
|
9
|
+
import logging
|
|
10
|
+
from mdfb.utils.constants import DELAY, RETRIES, EXP_WAIT_MAX, EXP_WAIT_MIN, EXP_WAIT_MULTIPLIER
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
14
|
+
|
|
15
|
+
def download_blobs(posts: list[dict], file_path: str, progress_bar: tqdm) -> None:
    """
    download_blobs: for the given posts, returned from fetch_post_details(), and filepath, downloads the associated blobs for each post.

    For every post this downloads the video (when the post has a "video_cid" key) and each
    image (when it has an "images_cid" key), then writes the post's "response" entry to
    "<filename>.json" inside file_path and advances the progress bar by one.

    Args:
        posts (list[dict]): post details returned from fetch_post_details()
        file_path (str): filepath for where the files will be stored
        progress_bar (tqdm): progress bar
    """
    logger = logging.getLogger(__name__)
    for post in posts:
        did = post["did"]
        filename = _make_base_filename(post["rkey"], post["text"], post["handle"])

        if "video_cid" in post:
            video_filename = _append_extension(filename, post["mime_type"])
            success = _get_blob_with_retries(did, post["video_cid"], video_filename, file_path, logger)
            if success:
                logger.info(f"Successful downloaded video: {video_filename}")
            # Pause between requests whether or not the download succeeded.
            time.sleep(DELAY)

        if "images_cid" in post:
            # Only number the files when the post carries more than one image.
            multiple_images = len(post["images_cid"]) > 1
            for index, image_cid in enumerate(post["images_cid"]):
                image_filename = _append_extension(
                    filename,
                    post["mime_type"],
                    index + 1 if multiple_images else None,
                )
                success = _get_blob_with_retries(did, image_cid, image_filename, file_path, logger)
                if success:
                    logger.info(f"Successful downloaded image: {image_filename}")
                time.sleep(DELAY)

        with open(f"{os.path.join(file_path, filename)}.json", "wt") as json_file:
            json.dump(post["response"], json_file, indent=4)
        # The ".json" suffix is written as literal text: reusing double quotes inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        logger.info(f"Sucessful wrote file: {filename}.json")
        progress_bar.update(1)
|
|
49
|
+
|
|
50
|
+
def _get_blob_with_retries(did: str, cid: str, filename: str, file_path: str, logger: logging.Logger):
    """
    _get_blob_with_retries: calls _get_blob() and turns any exception it raises into a boolean result.

    Args:
        did (str): DID passed through to _get_blob()
        cid (str): CID passed through to _get_blob()
        filename (str): name of the file to write
        file_path (str): directory the file is written to
        logger (logging.Logger): logger used to report the failure

    Returns:
        bool: True when _get_blob() completed, False when it raised
    """
    try:
        _get_blob(did, cid, filename, file_path, logger)
    except Exception:
        # Deliberate best-effort: log with traceback and let the caller carry on.
        logger.error(
            f"Error occured for downloading this file, DID: {did}, CID: {cid}, after {RETRIES} retires",
            exc_info=True,
        )
        return False
    else:
        return True
|
|
57
|
+
|
|
58
|
+
@retry(
    wait=wait_exponential(multiplier=EXP_WAIT_MULTIPLIER, min=EXP_WAIT_MIN, max=EXP_WAIT_MAX),
    stop=stop_after_attempt(RETRIES)
)
def _get_blob(did: str, cid: str, filename: str, file_path: str, logger: logging.Logger) -> None:
    """
    _get_blob: fetches a single blob with ComAtprotoSyncNamespace.get_blob and writes the response to file_path/filename in binary mode.

    The call is wrapped by tenacity's retry decorator (exponential wait, stopping after
    RETRIES attempts). Any exception is logged and re-raised so the decorator sees it.

    Args:
        did (str): DID sent as the "did" parameter
        cid (str): CID sent as the "cid" parameter
        filename (str): name of the file to write
        file_path (str): directory the file is written to
        logger (logging.Logger): logger used to report a failed attempt

    Returns:
        None: the function has no return value (the previous "bool" annotation was wrong)
    """
    try:
        res = ComAtprotoSyncNamespace(Client()).get_blob(ParamsDict(
            did=did,
            cid=cid
        ))
        with open(os.path.join(file_path, filename), "wb") as file:
            file.write(res)
    except Exception:
        logger.error(f"Error occured for downloading this file, DID: {did}, CID: {cid}")
        raise
|
|
73
|
+
|
|
74
|
+
def _make_base_filename(rkey: str, text: str, handle: str) -> str:
    """
    _make_base_filename: builds the extension-less filename "{rkey}_{handle}_{text}", truncated to 245 bytes and then passed through sanitize_filename().

    Args:
        rkey (str): record key of the post
        text (str): text of the post
        handle (str): handle of the post's author

    Returns:
        str: the truncated and sanitized base filename
    """
    raw_name = "_".join((f"{rkey}", f"{handle}", f"{text}"))
    shortened = _truncate_filename(raw_name, 245)
    return sanitize_filename(shortened)
|
|
78
|
+
|
|
79
|
+
def _append_extension(base_filename: str, mime_type: str = None, i: int = None) -> str:
|
|
80
|
+
filename = base_filename
|
|
81
|
+
if i:
|
|
82
|
+
filename += f"_{i}"
|
|
83
|
+
if mime_type:
|
|
84
|
+
file_type = re.search(r"\w+$", mime_type).group()
|
|
85
|
+
filename += f".{file_type}"
|
|
86
|
+
return filename
|
|
87
|
+
|
|
88
|
+
def _truncate_filename(filename: str, MAX_BYTE: int) -> str:
|
|
89
|
+
"""
|
|
90
|
+
_truncate_filename: truncates the given filename to the maximum number of bytes given, or less. This is only for utf-8 encoded strings and
|
|
91
|
+
if the filename at the maximum number of bytes is an invalid utf-8 string, then it removes one byte from the end so the string is valid.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
filename (str): string of the filename
|
|
95
|
+
MAX_BYTE (int): maximum bytes allowed
|
|
96
|
+
|
|
97
|
+
Returns:
|
|
98
|
+
str: truncated filename such that it is within the maximum number of bytes
|
|
99
|
+
"""
|
|
100
|
+
byte_len = 0
|
|
101
|
+
iter_encoder = encodings.search_function("utf-8").incrementalencoder()
|
|
102
|
+
for i, char in enumerate(filename):
|
|
103
|
+
byte_len += len(iter_encoder.encode(char))
|
|
104
|
+
if byte_len > MAX_BYTE:
|
|
105
|
+
return filename[:i]
|
|
106
|
+
return filename
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from atproto_client.namespaces.sync_ns import AppBskyFeedNamespace
|
|
3
|
+
from atproto_client.models.com.atproto.repo.list_records import ParamsDict
|
|
4
|
+
from atproto import Client
|
|
5
|
+
from atproto.exceptions import AtProtocolError
|
|
6
|
+
import time, json, logging
|
|
7
|
+
|
|
8
|
+
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
|
|
9
|
+
|
|
10
|
+
from mdfb.utils.helpers import get_chunk
|
|
11
|
+
from mdfb.utils.constants import DELAY, EXP_WAIT_MAX, EXP_WAIT_MIN, EXP_WAIT_MULTIPLIER, RETRIES
|
|
12
|
+
|
|
13
|
+
def fetch_post_details(uris: list[str]) -> list[dict]:
    """
    fetch_post_details: Fetches post details from the given AT-URIs

    The URIs are requested in chunks of 25. A chunk whose request fails is skipped.
    URIs that were requested but are absent from the responses seen so far are logged
    as missing/deleted.

    Args:
        uris (list[str]): A list of AT-URIs

    Returns:
        list[dict]: A list of dictionaries that contain post details
    """
    logger = logging.getLogger(__name__)
    client = Client("https://public.api.bsky.app/")
    collected = []
    returned_uris = set()

    for uri_chunk in get_chunk(uris, 25):
        logger.info(f"Fetching details from {len(uri_chunk)} URIs")
        res = _get_post_details_with_retries(uri_chunk, client, logger)
        if not res:
            continue
        records = json.loads(res.model_dump_json())

        for post in records["posts"]:
            returned_uris.add(post["uri"])
            post_details = {
                "rkey": _get_rkey(post["uri"]),
                "text": post["record"].get("text", ""),
                "response": post,
                **_get_author_details(post["author"])
            }

            embed_media = post["record"].get("embed", None)
            if embed_media:
                # When the embed has a nested "media" entry, use that instead.
                embed_media = embed_media.get("media", embed_media)
                post_details.update(_extract_media(embed_media))
                logger.info("Post details retrieved for URI: %s", post["uri"])
            collected.append(post_details)

        for uri in uri_chunk:
            if uri in returned_uris:
                continue
            logger.info(f"The post associated with this URI is missing/deleted: {uri}")
        time.sleep(DELAY)
    return collected
|
|
59
|
+
|
|
60
|
+
def _extract_media(embed: dict) -> dict:
|
|
61
|
+
"""
|
|
62
|
+
_extract_media: Extracts information from the media, or embede, key in the post details JSON response from the atproto API: app.bsky.feed.getPosts
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
embed (dict): The embed key from the API response of atproto API: app.bsky.feed.getPosts
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
dict: The associated information from embed
|
|
69
|
+
"""
|
|
70
|
+
media_links = {}
|
|
71
|
+
if embed.get("images"):
|
|
72
|
+
for image_obj in embed["images"]:
|
|
73
|
+
image = image_obj["image"]["ref"]["link"]
|
|
74
|
+
if "images_cid" not in media_links:
|
|
75
|
+
media_links["images_cid"] = [image]
|
|
76
|
+
else: media_links["images_cid"].append(image)
|
|
77
|
+
media_links["mime_type"] = image_obj["image"]["mime_type"]
|
|
78
|
+
if embed.get("video"):
|
|
79
|
+
media_links["video_cid"] = embed["video"]["ref"]["link"]
|
|
80
|
+
media_links["mime_type"] = embed["video"]["mime_type"]
|
|
81
|
+
return media_links
|
|
82
|
+
|
|
83
|
+
def _get_post_details_with_retries(uri_chunk: list, client: Client, logger: logging.Logger):
    """
    _get_post_details_with_retries: calls _get_post_details() and swallows its RetryError / AtProtocolError failures.

    Args:
        uri_chunk (list): AT-URIs to request
        client (Client): client passed through to _get_post_details()
        logger (logging.Logger): logger used to report the failure

    Returns:
        The response of _get_post_details(), or None when the request failed
    """
    response = None
    try:
        response = _get_post_details(uri_chunk, client, logger)
    except (RetryError, AtProtocolError):
        # Best-effort: report the failed chunk and signal it with None.
        logger.error(f"Failure to fetch records from the URIs: {uri_chunk}", exc_info=True)
    return response
|
|
89
|
+
|
|
90
|
+
@retry(
    stop=stop_after_attempt(RETRIES),
    wait=wait_exponential(multiplier=EXP_WAIT_MULTIPLIER, min=EXP_WAIT_MIN, max=EXP_WAIT_MAX),
)
def _get_post_details(uri_chunk: list, client: Client, logger: logging.Logger):
    """
    _get_post_details: requests the posts for the given AT-URIs through AppBskyFeedNamespace.get_posts.

    Wrapped by tenacity's retry decorator (exponential wait, stopping after RETRIES
    attempts). AtProtocolError / RetryError are logged and re-raised.

    Args:
        uri_chunk (list): AT-URIs to request
        client (Client): client the namespace is built on
        logger (logging.Logger): logger used to report a failed attempt

    Returns:
        The response object returned by get_posts()
    """
    params = ParamsDict(
        uris=uri_chunk
    )
    try:
        return AppBskyFeedNamespace(client).get_posts(params)
    except (AtProtocolError, RetryError):
        logger.error(f"Error occurred fetching records from URIs: {uri_chunk}")
        raise
|
|
103
|
+
|
|
104
|
+
def _get_rkey(at_uri: str) -> str:
|
|
105
|
+
match = re.search(r"\w+$", at_uri)
|
|
106
|
+
return match.group()
|
|
107
|
+
|
|
108
|
+
def _get_author_details(author: dict) -> dict:
|
|
109
|
+
author_details = {}
|
|
110
|
+
author_details["did"] = author["did"]
|
|
111
|
+
author_details["handle"] = author["handle"]
|
|
112
|
+
author_details["display_name"] = author["display_name"]
|
|
113
|
+
return author_details
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from atproto_client.namespaces.sync_ns import ComAtprotoRepoNamespace
|
|
3
|
+
from atproto_client.models.com.atproto.repo.list_records import ParamsDict
|
|
4
|
+
from atproto import Client
|
|
5
|
+
from atproto.exceptions import AtProtocolError
|
|
6
|
+
import re, time, logging
|
|
7
|
+
|
|
8
|
+
from mdfb.utils.constants import DELAY
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_post_identifiers(did: str, limit: int, feed_type: str) -> list[str]:
    """
    get_post_identifiers: Gets the given amount AT-URIs of the posts wanted from the desired account

    Records of the collection "app.bsky.feed.{feed_type}" are listed in pages of at most
    100. The trailing segment of the last record's URI is used as the cursor for the
    next page. Paging stops when the limit is used up or a page comes back empty.

    Args:
        did (str): DID of the target account
        limit (int): The amount wanted to get
        feed_type (str): The type of post wanted from the account: like, repost and post

    Raises:
        SystemExit: If there is a failure to retrieve posts

    Returns:
        list[str]: A list of the desired AT-URIs
    """
    logger = logging.getLogger(__name__)
    client = Client()
    collected_uris = []
    cursor = ""

    while limit > 0:
        fetch_amount = min(100, limit)
        try:
            logger.info(f"Fetching up to {fetch_amount} posts for DID: {did}, feed_type: {feed_type}")
            params = ParamsDict(
                collection=f"app.bsky.feed.{feed_type}",
                repo=did,
                limit=fetch_amount,
                cursor=cursor,
            )
            res = ComAtprotoRepoNamespace(client).list_records(params)
            res = json.loads(res.model_dump_json())
        except AtProtocolError as e:
            logger.error(f"Failure to fetch posts: {e}", exc_info=True)
            print("Failure to get fetch posts. See logs for details.")
            raise SystemExit(1) from e

        limit -= fetch_amount
        logger.info("Successful retrieved: %d posts, %d remaining", fetch_amount, limit)

        records = res.get("records", {})
        if not records:
            logger.info(f"No more records to fetch for DID: {did}, feed_type: {feed_type}")
            break

        cursor = re.search(r"\w+$", records[-1]["uri"])[0]
        # A post record is the target itself; likes and reposts point at their subject.
        for record in records:
            collected_uris.append(
                record["uri"] if feed_type == "post" else record["value"]["subject"]["uri"]
            )
        time.sleep(DELAY)
    return collected_uris
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from atproto_identity.handle.resolver import HandleResolver
|
|
3
|
+
from atproto_identity.exceptions import DidNotFoundError
|
|
4
|
+
|
|
5
|
+
def resolve_handle(handle: str) -> str:
    """
    resolve_handle: for a given handle, resolves the handle to a DID using HandleResolver.ensure_resolve()

    Args:
        handle (str): handle of the target account

    Raises:
        DidNotFoundError: if the handle is unable to be resolved

    Returns:
        str: resolved DID
    """
    logger = logging.getLogger(__name__)
    try:
        did = HandleResolver().ensure_resolve(handle)
    except DidNotFoundError as err:
        logger.error(f"Unable to resolve handle: {handle}")
        # Chain explicitly so the resolver's original error stays attached as the cause.
        raise DidNotFoundError(f"Unable to resolve handle: {handle}") from err
    return did
|
mdfb-0.1.0/mdfb/mdfb.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from argparse import ArgumentParser
|
|
2
|
+
from tqdm import tqdm
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
+
|
|
5
|
+
from mdfb.core.get_post_identifiers import get_post_identifiers
|
|
6
|
+
from mdfb.core.fetch_post_details import fetch_post_details
|
|
7
|
+
from mdfb.core.download_blobs import download_blobs
|
|
8
|
+
from mdfb.core.resolve_handle import resolve_handle
|
|
9
|
+
from mdfb.utils.validation import *
|
|
10
|
+
from mdfb.utils.helpers import split_list
|
|
11
|
+
from mdfb.utils.logging import setup_logging
|
|
12
|
+
from mdfb.utils.constants import DEFAULT_THREADS
|
|
13
|
+
|
|
14
|
+
def fetch_posts(did: str, limit: int, post_types: dict) -> list[str]:
    """
    fetch_posts: collects the AT-URIs for every post type that is switched on in post_types.

    Args:
        did (str): DID of the target account
        limit (int): amount requested for each enabled post type
        post_types (dict): maps a post type name to a truthy/falsy "wanted" flag

    Returns:
        list[str]: the AT-URIs of all enabled post types, in the dictionary's order
    """
    collected = []
    for post_type in post_types:
        if not post_types[post_type]:
            continue
        collected.extend(get_post_identifiers(did, limit, post_type))
    return collected
|
|
20
|
+
|
|
21
|
+
def process_posts(posts: list, num_threads: int) -> list[dict]:
    """
    process_posts: fetches the post details for the given post URIs, spreading the work over a thread pool

    The URIs are split into num_threads batches with split_list(). Each batch is handed
    to fetch_post_details() on its own worker, and results are gathered as workers finish.

    Args:
        posts (list): list of URIs of the post wanted
        num_threads (int): number of threads

    Returns:
        list[dict]: list of dictionaries that contain post details for each post
    """
    batches = split_list(posts, num_threads)
    gathered = []

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        pending = [pool.submit(fetch_post_details, batch) for batch in batches]
        for finished in as_completed(pending):
            gathered.extend(finished.result())
    return gathered
|
|
42
|
+
|
|
43
|
+
def main():
    """
    main: command line entry point.

    Parses the arguments, validates them, fetches the post identifiers for the requested
    post types, fetches the post details and downloads the files with a thread pool.
    Any exception raised along the way, including one raised inside a download worker,
    is reported as "Error: ..." on stdout.
    """
    parser = ArgumentParser()

    parser.add_argument("directory", action="store", help="Directory for where all downloaded post will be stored")
    parser.add_argument("-l", "--limit", action="store", required=True, help="The number of posts to be downloaded")
    parser.add_argument("--like", action="store_true", help="To retreive liked posts")
    parser.add_argument("--post", action="store_true", help="To retreive posts")
    parser.add_argument("--repost", action="store_true", help="To retreive reposts")
    parser.add_argument("--threads", action="store", help="Number of threads, maximum of 3 threads")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--did", action="store", help="The DID associated with the account")
    group.add_argument("--handle", action="store", help="The handle for the account e.g. johnny.bsky.social")

    args = parser.parse_args()
    try:
        did = validate_did(args.did) if args.did else resolve_handle(args.handle)
        directory = validate_directory(args.directory)
        limit = validate_limit(args.limit)

        setup_logging(directory)

        num_threads = validate_threads(args.threads) if args.threads else DEFAULT_THREADS

        if not any([args.like, args.post, args.repost]):
            raise ValueError("At least one flag (--like, --post, --repost) must be set.")

        post_types = {
            "like": args.like,
            "repost": args.repost,
            "post": args.post
        }

        print("Fetching post identifiers...")
        posts = fetch_posts(did, limit, post_types)

        wanted_post_types = [post_type for post_type, wanted in post_types.items() if wanted]
        account = args.handle if args.handle else did
        validate_no_posts(posts, account, wanted_post_types)

        print("Getting post details...")
        post_details = process_posts(posts, num_threads)

        num_of_posts = len(post_details)
        post_links = split_list(post_details, num_threads)

        with tqdm(total=num_of_posts, desc="Downloading files") as progress_bar:
            with ThreadPoolExecutor(max_workers=num_threads) as executor:
                futures = [
                    executor.submit(download_blobs, batch_post_link, directory, progress_bar)
                    for batch_post_link in post_links
                ]
                # Collect every result: an exception raised in a worker is only
                # re-raised by result(), otherwise it would be dropped silently.
                for future in as_completed(futures):
                    future.result()

    except Exception as e:
        print(f"Error: {e}")
|
|
98
|
+
|
|
99
|
+
# Run the command line interface only when this module is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
def split_list(input_list: list, split_by: int) -> list[list[str]]:
    """
    split_list: divides the list into exactly `split_by` consecutive chunks whose sizes differ by at most one, used for handing out work to threads

    When the length is not divisible by `split_by`, the leading chunks receive one extra
    element each. When the list is shorter than `split_by`, the trailing chunks are empty.

    Args:
        input_list (list): list to split
        split_by (int): number of chunks wanted, must be >= 1

    Raises:
        ValueError: if `split_by` is less than 1

    Returns:
        list[list[str]]: the `split_by` chunks, in original order
    """
    if split_by < 1:
        raise ValueError("Please enter split_by to be greater than 0")

    base_size, extra = divmod(len(input_list), split_by)
    chunks = []
    cursor = 0
    for index in range(split_by):
        # The first `extra` chunks absorb the remainder, one element each.
        width = base_size + 1 if index < extra else base_size
        chunks.append(input_list[cursor:cursor + width])
        cursor += width
    return chunks
|
|
26
|
+
|
|
27
|
+
def get_chunk(posts: list, chunk_size: int):
    """
    get_chunk: lazily yields consecutive slices of a list, each holding at most `chunk_size` items.

    Args:
        posts (list): the list to walk through.
        chunk_size (int): maximum number of items per yielded slice. Must be >= 1.

    Yields:
        list: the next slice of `posts`; only the final slice may be shorter than `chunk_size`.

    Raises:
        ValueError: If `chunk_size` is less than 1 (raised when the generator is first advanced).
    """
    if chunk_size < 1:
        raise ValueError("Please enter a chunk size >= 1")

    total = len(posts)
    offset = 0
    while offset < total:
        yield posts[offset:offset + chunk_size]
        offset += chunk_size
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
def setup_logging(directory: str):
    """
    setup_logging: configures the root logger to write records of level INFO and above to a timestamped log file

    The file is named `mdfb_<DDMMYYYY>_<HHMMSS>.log` from the local time of the call and is created inside `directory`.
    If the root logger already has handlers, nothing is changed and no file is created.

    Args:
        directory (str): existing directory the log file is written to
    """
    # logging.basicConfig() is a no-op once the root logger has handlers, so return
    # first instead of opening a log file that would never receive any records.
    if logging.getLogger().handlers:
        return

    log_name = datetime.datetime.now().strftime("mdfb_%d%m%Y_%H%M%S.log")
    # basicConfig() only accepts `encoding` from Python 3.9 onwards and raises
    # ValueError on 3.8, so the UTF-8 file handler is built explicitly instead.
    file_handler = logging.FileHandler(os.path.join(directory, log_name), encoding="utf-8")
    logging.basicConfig(
        handlers=[file_handler],
        level=logging.INFO,
        format='[%(asctime)s] %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
    )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from mdfb.utils.constants import MAX_THREADS
|
|
5
|
+
|
|
6
|
+
def validate_directory(directory: str) -> str:
    """
    validate_directory: checks that `directory` is an existing directory and returns it without trailing slashes

    Args:
        directory (str): path to check

    Returns:
        str: `directory` with trailing "/" characters removed; a path made up only of slashes (the filesystem root) is returned as "/"

    Raises:
        ValueError: if `directory` does not exist or is not a directory
    """
    # os.path.isdir() is already False for paths that do not exist.
    if not os.path.isdir(directory):
        raise ValueError("The given filepath is either not valid or does not exist")
    # Stripping every slash from the root ("/") would leave "", which os.path.join()
    # treats as the current working directory, so the root is preserved.
    return directory.rstrip("/") or "/"
|
|
10
|
+
|
|
11
|
+
def validate_limit(limit: str) -> int:
    """
    validate_limit: converts the textual limit into a positive integer

    Args:
        limit (str): the limit as text, e.g. "25"

    Returns:
        int: the parsed limit, always >= 1

    Raises:
        ValueError: if `limit` is not made up solely of decimal digits, or if it is 0
    """
    # str.isdigit() also accepts characters such as "²" that int() cannot parse,
    # which surfaced as an unrelated int() error; isdecimal() only accepts
    # characters that int() is able to convert.
    if not limit.isdecimal():
        raise ValueError("The given limit is not a integer")
    value = int(limit)
    if value < 1:
        raise ValueError("The given limit is 0 or less")
    return value
|
|
17
|
+
|
|
18
|
+
def validate_did(did: str) -> str:
    """
    validate_did: checks that the given string has the shape of a DID (`did:<method>:<identifier>`)

    The method must be lowercase letters; the identifier may contain letters, digits and `._:%-`,
    but must not end in `:` or `%`.

    Args:
        did (str): the DID to check

    Returns:
        str: `did`, unchanged

    Raises:
        ValueError: if `did` does not match the expected shape
    """
    # fullmatch() is used rather than a "$"-anchored search because "$" also
    # matches just before a trailing newline, which let "did:plc:abc\n" through.
    if not re.fullmatch(r"did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]", did):
        raise ValueError("The given DID is not valid")
    return did
|
|
22
|
+
|
|
23
|
+
def validate_threads(threads: str) -> int:
    """
    validate_threads: converts the textual thread count into an integer between 1 and MAX_THREADS

    A value above MAX_THREADS is not an error: it is reduced to MAX_THREADS, and a notice is
    both logged and printed.

    Args:
        threads (str): the requested number of threads as text, e.g. "4"

    Returns:
        int: the thread count to use, capped at MAX_THREADS

    Raises:
        ValueError: if `threads` is not made up solely of decimal digits, or if the resulting count is below 1
    """
    # str.isdigit() also accepts characters such as "²" that int() cannot parse;
    # isdecimal() only accepts characters that int() is able to convert.
    if not threads.isdecimal():
        raise ValueError("Please enter an integer")
    count = int(threads)
    if count > MAX_THREADS:
        notice = f"Entered {count} threads, but the maximum is {MAX_THREADS}. Setting to {MAX_THREADS} threads"
        logging.info(notice)
        # The printed notice ends with a full stop; the logged one does not.
        print(f"{notice}.")
        count = MAX_THREADS
    if count < 1:
        raise ValueError("Please set threads to 1 or more")
    return count
|
|
34
|
+
|
|
35
|
+
def validate_no_posts(posts: list, account: str, post_types: list):
    """
    validate_no_posts: fails when no posts were retrieved for the account

    Args:
        posts (list): the retrieved posts
        account (str): handle or DID of the account, used only in the error message
        post_types (list): the post types that were requested, used only in the error message

    Raises:
        ValueError: if `posts` is empty
    """
    if posts:
        return
    raise ValueError(f"There are no posts associated with account: {account}, for post_type(s): {post_types}")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "mdfb"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = ["Ibrahim Haji Abdi <ibrahim.hajiabdi09@gmail.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
|
|
8
|
+
[tool.poetry.scripts]
|
|
9
|
+
mdfb = "mdfb.mdfb:main"
|
|
10
|
+
|
|
11
|
+
# Runtime dependencies. `tdqm` (a misspelling of `tqdm`, which is listed below) and
# `argparse` (part of the standard library on every supported Python) were removed.
[tool.poetry.dependencies]
python = "<3.14,>=3.8"
atproto = "^0.0.55"
pathvalidate = "^3.2.1"
tqdm = "^4.67.1"
tenacity = "^9.0.0"

# Test tooling is only needed for development, so it is kept out of the
# requirements installed alongside the CLI.
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
requests-mock = "^1.12.1"
pytest-mock = "^3.14.0"
|
|
22
|
+
|
|
23
|
+
[project]
|
|
24
|
+
name = "mdfb"
|
|
25
|
+
version = "0.1.0"
|
|
26
|
+
authors = [
|
|
27
|
+
{ name="Ibrahim Haji Abdi", email="ibrahim.hajiabdi09@gmail.com" },
|
|
28
|
+
]
|
|
29
|
+
description = "A CLI for downloading posts in bulk from Bluesky from a specified account"
|
|
30
|
+
readme = "README.md"
|
|
31
|
+
requires-python = "<3.14,>=3.8"
|
|
32
|
+
classifiers = [
|
|
33
|
+
"Programming Language :: Python :: 3",
|
|
34
|
+
"License :: OSI Approved :: MIT License",
|
|
35
|
+
"Operating System :: OS Independent",
|
|
36
|
+
"Environment :: Console"
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://github.com/IbrahimHajiAbdi/mass-downloader-for-bluesky"
|
|
41
|
+
Issues = "https://github.com/IbrahimHajiAbdi/mass-downloader-for-bluesky/issues"
|
|
42
|
+
|
|
43
|
+
[build-system]
|
|
44
|
+
requires = ["poetry-core"]
|
|
45
|
+
build-backend = "poetry.core.masonry.api"
|