mdfb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdfb-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ibrahim Haji Abdi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
mdfb-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.1
2
+ Name: mdfb
3
+ Version: 0.1.0
4
+ Summary:
5
+ Author: Ibrahim Haji Abdi
6
+ Author-email: ibrahim.hajiabdi09@gmail.com
7
+ Requires-Python: >=3.8,<3.14
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Requires-Dist: argparse (>=1.4.0,<2.0.0)
16
+ Requires-Dist: atproto (>=0.0.55,<0.0.56)
17
+ Requires-Dist: pathvalidate (>=3.2.1,<4.0.0)
18
+ Requires-Dist: pytest (>=8.3.4,<9.0.0)
19
+ Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
20
+ Requires-Dist: requests-mock (>=1.12.1,<2.0.0)
21
+ Requires-Dist: tdqm (>=0.0.1,<0.0.2)
22
+ Requires-Dist: tenacity (>=9.0.0,<10.0.0)
23
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0)
24
+ Description-Content-Type: text/markdown
25
+
26
+ # mdfb-downloader-for-bluesky
27
+
28
+ mass-downloader-for-bluesky (mdfb) is a Python cli application that can download large amounts of posts from bluesky from any given account.
29
+
30
+ ## Navigation
31
+ - [mdfb-downloader-for-bluesky](#mdfb-downloader-for-bluesky)
32
+ - [Navigation](#navigation)
33
+ - [Installation](#installation)
34
+ - [Manual](#manual)
35
+ - [Usage](#usage)
36
+ - [Examples](#examples)
37
+ - [Linux](#linux)
38
+ - [Windows](#windows)
39
+ - [Naming Convention](#naming-convention)
40
+ - [Download Amount](#download-amount)
41
+ - [Note](#note)
42
+ - [Options](#options)
43
+ - [Note](#note-1)
44
+
45
+ ## Installation
46
+
47
+ You will need [Python](https://www.python.org/downloads/) to be installed to use this CLI.
48
+
49
+ You can install via pip by:
50
+ ```bash
51
+ pip install mdfb
52
+ ```
53
+
54
+ ### Manual
55
+
56
+ Have [Poetry](https://python-poetry.org/) installed.
57
+
58
+ Then clone the project, open a poetry shell and then install all dependencies.
59
+
60
+
61
+ ```bash
62
+ git clone git@github.com:IbrahimHajiAbdi/mdfb-downloader-for-bluesky.git
63
+ cd mdfb-downloader-for-bluesky
64
+ poetry shell
65
+ poetry install
66
+ ```
67
+
68
+ ## Usage
69
+ ``mdfb`` works by using the public API offered by bluesky to retrieve posts liked, reposted or posted by the desired account.
70
+
71
+ ``mdfb`` will download the information for a post and the accompanying media, video or image(s). If there is no image(s) or video, it will just download the information of the post. The information of the post will be a JSON file and have lots of accompanying data, such as the text in the post, creation time of the post and author details. Currently, the retrieved posts start from the latest post to the oldest.
72
+
73
+ You will need to be inside a poetry shell to use ``mdfb`` if installed manually
74
+
75
+ ### Examples
76
+
77
+ Some example commands would be:
78
+
79
+ #### Linux
80
+ ```bash
81
+ mdfb --handle bsky.app -l 10 --like ./media/
82
+ ```
83
+
84
+ #### Windows
85
+ ```bash
86
+ mdfb --handle bsky.app -l 100 --like --repost --post ./media/
87
+ ```
88
+
89
+ ### Naming Convention
90
+ ``mdfb``'s naming convention is: ``"{rkey}_{handle}_{text}"``, if it is downloading a post with multiple images then the naming will be: ``"{rkey}_{handle}_{text}_{i}"``, where "i" represents the order of the images in the post ranging from 1 - 4. In addition, the filenames are limited to 256 bytes and will be truncated down to that size.
91
+
92
+ ### Download Amount
93
+ When specifying the limit, this will be true for all types of post downloaded. For example:
94
+ ```bash
95
+ mdfb --handle bsky.app -l 100 --like --repost --post ./media/
96
+ ```
97
+ This would download 100 likes, 100 reposts and 100 posts, totalling 300 posts downloaded.
98
+
99
+ ### Note
100
+ The maximum number of threads is currently 3, which can be changed in the ``mdfb/utils/constants.py`` file. Furthermore, there are more constants that can be changed in that file, such as the delay between each request and the number of retries before marking a post as a failure and continuing.
101
+
102
+ ## Options
103
+ - ``--handle``
104
+ - The handle of the target account.
105
+ - ``--did, -d``
106
+ - The DID of the target account.
107
+ - ``--limit, -l``
108
+ - The number of posts to download. **Required**.
109
+ - ``directory``
110
+ - Positional argument, where all the downloaded files are to be located. **Required**.
111
+ - ``--threads``
112
+ - The amount of threads wanted to download posts more efficiently, maximum number of threads is 3.
113
+ - ``--like``
114
+ - To retrieve liked posts
115
+ - ``--repost``
116
+ - To retrieve reposts
117
+ - ``--post``
118
+ - To retrieve posts
119
+ ### Note
120
+ At least one of the flags: ``--like``, ``--repost``, ``--post`` is **required**.
121
+ In addition, ``--did, -d`` and ``--handle`` are mutually exclusive, and at least one of them is **required** as well.
mdfb-0.1.0/README.md ADDED
@@ -0,0 +1,96 @@
1
+ # mdfb-downloader-for-bluesky
2
+
3
+ mass-downloader-for-bluesky (mdfb) is a Python cli application that can download large amounts of posts from bluesky from any given account.
4
+
5
+ ## Navigation
6
+ - [mdfb-downloader-for-bluesky](#mdfb-downloader-for-bluesky)
7
+ - [Navigation](#navigation)
8
+ - [Installation](#installation)
9
+ - [Manual](#manual)
10
+ - [Usage](#usage)
11
+ - [Examples](#examples)
12
+ - [Linux](#linux)
13
+ - [Windows](#windows)
14
+ - [Naming Convention](#naming-convention)
15
+ - [Download Amount](#download-amount)
16
+ - [Note](#note)
17
+ - [Options](#options)
18
+ - [Note](#note-1)
19
+
20
+ ## Installation
21
+
22
+ You will need [Python](https://www.python.org/downloads/) to be installed to use this CLI.
23
+
24
+ You can install via pip by:
25
+ ```bash
26
+ pip install mdfb
27
+ ```
28
+
29
+ ### Manual
30
+
31
+ Have [Poetry](https://python-poetry.org/) installed.
32
+
33
+ Then clone the project, open a poetry shell and then install all dependencies.
34
+
35
+
36
+ ```bash
37
+ git clone git@github.com:IbrahimHajiAbdi/mdfb-downloader-for-bluesky.git
38
+ cd mdfb-downloader-for-bluesky
39
+ poetry shell
40
+ poetry install
41
+ ```
42
+
43
+ ## Usage
44
+ ``mdfb`` works by using the public API offered by bluesky to retrieve posts liked, reposted or posted by the desired account.
45
+
46
+ ``mdfb`` will download the information for a post and the accompanying media, video or image(s). If there is no image(s) or video, it will just download the information of the post. The information of the post will be a JSON file and have lots of accompanying data, such as the text in the post, creation time of the post and author details. Currently, the retrieved posts start from the latest post to the oldest.
47
+
48
+ You will need to be inside a poetry shell to use ``mdfb`` if installed manually
49
+
50
+ ### Examples
51
+
52
+ Some example commands would be:
53
+
54
+ #### Linux
55
+ ```bash
56
+ mdfb --handle bsky.app -l 10 --like ./media/
57
+ ```
58
+
59
+ #### Windows
60
+ ```bash
61
+ mdfb --handle bsky.app -l 100 --like --repost --post ./media/
62
+ ```
63
+
64
+ ### Naming Convention
65
+ ``mdfb``'s naming convention is: ``"{rkey}_{handle}_{text}"``, if it is downloading a post with multiple images then the naming will be: ``"{rkey}_{handle}_{text}_{i}"``, where "i" represents the order of the images in the post ranging from 1 - 4. In addition, the filenames are limited to 256 bytes and will be truncated down to that size.
66
+
67
+ ### Download Amount
68
+ When specifying the limit, this will be true for all types of post downloaded. For example:
69
+ ```bash
70
+ mdfb --handle bsky.app -l 100 --like --repost --post ./media/
71
+ ```
72
+ This would download 100 likes, 100 reposts and 100 posts, totalling 300 posts downloaded.
73
+
74
+ ### Note
75
+ The maximum number of threads is currently 3, which can be changed in the ``mdfb/utils/constants.py`` file. Furthermore, there are more constants that can be changed in that file, such as the delay between each request and the number of retries before marking a post as a failure and continuing.
76
+
77
+ ## Options
78
+ - ``--handle``
79
+ - The handle of the target account.
80
+ - ``--did, -d``
81
+ - The DID of the target account.
82
+ - ``--limit, -l``
83
+ - The number of posts to download. **Required**.
84
+ - ``directory``
85
+ - Positional argument, where all the downloaded files are to be located. **Required**.
86
+ - ``--threads``
87
+ - The amount of threads wanted to download posts more efficiently, maximum number of threads is 3.
88
+ - ``--like``
89
+ - To retrieve liked posts
90
+ - ``--repost``
91
+ - To retrieve reposts
92
+ - ``--post``
93
+ - To retrieve posts
94
+ ### Note
95
+ At least one of the flags: ``--like``, ``--repost``, ``--post`` is **required**.
96
+ In addition, ``--did, -d`` and ``--handle`` are mutually exclusive, and at least one of them is **required** as well.
File without changes
@@ -0,0 +1,106 @@
1
+ import json
2
+ import os
3
+ from atproto_client.namespaces.sync_ns import ComAtprotoSyncNamespace
4
+ from atproto_client.models.com.atproto.repo.list_records import ParamsDict
5
+ from atproto import Client
6
+ import re, time
7
+ from pathvalidate import sanitize_filename
8
+ import encodings
9
+ import logging
10
+ from mdfb.utils.constants import DELAY, RETRIES, EXP_WAIT_MAX, EXP_WAIT_MIN, EXP_WAIT_MULTIPLIER
11
+ from tqdm import tqdm
12
+
13
+ from tenacity import retry, stop_after_attempt, wait_exponential
14
+
15
def download_blobs(posts: list[dict], file_path: str, progress_bar: tqdm) -> None:
    """
    download_blobs: for the given posts, returned from fetch_post_details(), and filepath, downloads the associated blobs for each post.

    For every post the video blob (if any) and the image blobs (if any) are downloaded, then the post's
    "response" entry is written to "<base filename>.json" and the progress bar is advanced by one.
    A blob that cannot be downloaded is logged by _get_blob_with_retries() and skipped; the JSON file
    is still written.

    Args:
        posts (list[dict]): post details returned from fetch_post_details()
        file_path (str): filepath for where the files will be stored
        progress_bar (tqdm): progress bar
    """
    logger = logging.getLogger(__name__)
    for post in posts:
        did = post["did"]
        filename = _make_base_filename(post["rkey"], post["text"], post["handle"])
        if "video_cid" in post:
            video_filename = _append_extension(filename, post["mime_type"])
            success = _get_blob_with_retries(did, post["video_cid"], video_filename, file_path, logger)
            if success:
                logger.info(f"Successful downloaded video: {video_filename}")
            # Pause after every blob request, successful or not.
            time.sleep(DELAY)

        if "images_cid" in post:
            for index, image_cid in enumerate(post["images_cid"]):
                # Only posts with several images get the 1-based "_<n>" suffix.
                if len(post["images_cid"]) > 1:
                    image_filename = _append_extension(filename, post["mime_type"], index + 1)
                else:
                    image_filename = _append_extension(filename, post["mime_type"])
                success = _get_blob_with_retries(did, image_cid, image_filename, file_path, logger)
                if success:
                    logger.info(f"Successful downloaded image: {image_filename}")
                time.sleep(DELAY)

        # Built outside the f-string: reusing double quotes inside a replacement field is a
        # SyntaxError on interpreters older than Python 3.12.
        json_filename = f"{filename}.json"
        with open(os.path.join(file_path, json_filename), "wt") as json_file:
            json.dump(post["response"], json_file, indent=4)
        logger.info(f"Sucessful wrote file: {json_filename}")
        progress_bar.update(1)
49
+
50
def _get_blob_with_retries(did: str, cid: str, filename: str, file_path: str, logger: logging.Logger):
    """
    _get_blob_with_retries: runs _get_blob() and reports the outcome as a boolean instead of raising

    Args:
        did (str): DID of the account that owns the blob
        cid (str): CID of the blob
        filename (str): name of the file the blob is written to
        file_path (str): directory the file is written into
        logger (logging.Logger): logger used to record a failure

    Returns:
        bool: True when _get_blob() completed, False when it raised any Exception
    """
    try:
        _get_blob(did, cid, filename, file_path, logger)
    except Exception:
        logger.error(f"Error occured for downloading this file, DID: {did}, CID: {cid}, after {RETRIES} retires", exc_info=True)
        return False
    else:
        return True
57
+
58
@retry(
    wait=wait_exponential(multiplier=EXP_WAIT_MULTIPLIER, min=EXP_WAIT_MIN, max=EXP_WAIT_MAX),
    stop=stop_after_attempt(RETRIES)
)
def _get_blob(did: str, cid: str, filename: str, file_path: str, logger: logging.Logger) -> None:
    """
    _get_blob: fetches one blob and writes it to disk, re-attempted by the tenacity decorator above

    Args:
        did (str): DID of the account that owns the blob
        cid (str): CID of the blob
        filename (str): name of the file the blob is written to
        file_path (str): directory the file is written into
        logger (logging.Logger): logger used to record each failed attempt

    Raises:
        Exception: whatever the request or the file write raised is logged and re-raised so the
        retry decorator can see the failure

    Returns:
        None: the function only has the side effect of writing the file
    """
    try:
        res = ComAtprotoSyncNamespace(Client()).get_blob(ParamsDict(
            did=did,
            cid=cid
        ))
        with open(os.path.join(file_path, filename), "wb") as file:
            file.write(res)
    except Exception:
        logger.error(f"Error occured for downloading this file, DID: {did}, CID: {cid}")
        raise
73
+
74
def _make_base_filename(rkey: str, text: str, handle: str) -> str:
    """
    _make_base_filename: builds the extension-less filename "{rkey}_{handle}_{text}" for a post

    The joined name is cut to at most 245 bytes by _truncate_filename() and then passed through
    sanitize_filename().

    Args:
        rkey (str): record key of the post
        text (str): text of the post
        handle (str): handle of the post's author

    Returns:
        str: the truncated and sanitized base filename
    """
    joined = "{}_{}_{}".format(rkey, handle, text)
    return sanitize_filename(_truncate_filename(joined, 245))
78
+
79
+ def _append_extension(base_filename: str, mime_type: str = None, i: int = None) -> str:
80
+ filename = base_filename
81
+ if i:
82
+ filename += f"_{i}"
83
+ if mime_type:
84
+ file_type = re.search(r"\w+$", mime_type).group()
85
+ filename += f".{file_type}"
86
+ return filename
87
+
88
+ def _truncate_filename(filename: str, MAX_BYTE: int) -> str:
89
+ """
90
+ _truncate_filename: truncates the given filename to the maximum number of bytes given, or less. This is only for utf-8 encoded strings and
91
+ if the filename at the maximum number of bytes is an invalid utf-8 string, then it removes one byte from the end so the string is valid.
92
+
93
+ Args:
94
+ filename (str): string of the filename
95
+ MAX_BYTE (int): maximum bytes allowed
96
+
97
+ Returns:
98
+ str: truncated filename such that it is within the maximum number of bytes
99
+ """
100
+ byte_len = 0
101
+ iter_encoder = encodings.search_function("utf-8").incrementalencoder()
102
+ for i, char in enumerate(filename):
103
+ byte_len += len(iter_encoder.encode(char))
104
+ if byte_len > MAX_BYTE:
105
+ return filename[:i]
106
+ return filename
@@ -0,0 +1,113 @@
1
+ import re
2
+ from atproto_client.namespaces.sync_ns import AppBskyFeedNamespace
3
+ from atproto_client.models.com.atproto.repo.list_records import ParamsDict
4
+ from atproto import Client
5
+ from atproto.exceptions import AtProtocolError
6
+ import time, json, logging
7
+
8
+ from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
9
+
10
+ from mdfb.utils.helpers import get_chunk
11
+ from mdfb.utils.constants import DELAY, EXP_WAIT_MAX, EXP_WAIT_MIN, EXP_WAIT_MULTIPLIER, RETRIES
12
+
13
def fetch_post_details(uris: list[str]) -> list[dict]:
    """
    fetch_post_details: Fetches post details from the given AT-URIs

    The URIs are requested in chunks of 25. A chunk whose request fails (the retrying helper
    returns nothing) is skipped, so the result can hold fewer entries than ``uris``.

    Args:
        uris (list[str]): A list of AT-URIs

    Returns:
        list[dict]: A list of dictionaries that contain post details. Each one holds "rkey", "text",
        "response" and the keys of _get_author_details(), plus the keys of _extract_media() when
        the post record has an embed.
    """
    all_post_details = []
    logger = logging.getLogger(__name__)
    # URIs that appeared in a response; used below to report requested posts that were not returned.
    seen_uris = set()
    client = Client("https://public.api.bsky.app/")

    for uri_chunk in get_chunk(uris, 25):
        logger.info(f"Fetching details from {len(uri_chunk)} URIs")
        res = _get_post_details_with_retries(uri_chunk, client, logger)
        if not res:
            # The helper already logged the failure; move on to the next chunk.
            continue
        records = json.loads(res.model_dump_json())

        for post in records["posts"]:
            seen_uris.add(post["uri"])
            post_details = {
                "rkey": _get_rkey(post["uri"]),
                "text": post["record"].get("text", ""),
                "response": post,
                **_get_author_details(post["author"])
            }

            embed_media = post["record"].get("embed", None)
            if not embed_media:
                # No embed: keep the text-only details and skip media extraction.
                all_post_details.append(post_details)
                continue

            # Use the nested "media" entry when the embed has one, otherwise the embed itself.
            embed_media = embed_media.get("media", embed_media)
            post_details.update(_extract_media(embed_media))

            logger.info("Post details retrieved for URI: %s", post["uri"])
            all_post_details.append(post_details)
        for uri in uri_chunk:
            if uri not in seen_uris:
                logger.info(f"The post associated with this URI is missing/deleted: {uri}")
        # Pause between chunk requests.
        time.sleep(DELAY)
    return all_post_details
59
+
60
+ def _extract_media(embed: dict) -> dict:
61
+ """
62
+ _extract_media: Extracts information from the media, or embede, key in the post details JSON response from the atproto API: app.bsky.feed.getPosts
63
+
64
+ Args:
65
+ embed (dict): The embed key from the API response of atproto API: app.bsky.feed.getPosts
66
+
67
+ Returns:
68
+ dict: The associated information from embed
69
+ """
70
+ media_links = {}
71
+ if embed.get("images"):
72
+ for image_obj in embed["images"]:
73
+ image = image_obj["image"]["ref"]["link"]
74
+ if "images_cid" not in media_links:
75
+ media_links["images_cid"] = [image]
76
+ else: media_links["images_cid"].append(image)
77
+ media_links["mime_type"] = image_obj["image"]["mime_type"]
78
+ if embed.get("video"):
79
+ media_links["video_cid"] = embed["video"]["ref"]["link"]
80
+ media_links["mime_type"] = embed["video"]["mime_type"]
81
+ return media_links
82
+
83
def _get_post_details_with_retries(uri_chunk: list, client: Client, logger: logging.Logger):
    """
    _get_post_details_with_retries: runs _get_post_details() and turns a final failure into a None result

    Args:
        uri_chunk (list): AT-URIs to request in one call
        client (Client): client the request is made with
        logger (logging.Logger): logger used to record a failure

    Returns:
        the response of _get_post_details(), or None when it raised RetryError or AtProtocolError
    """
    try:
        response = _get_post_details(uri_chunk, client, logger)
    except (RetryError, AtProtocolError):
        logger.error(f"Failure to fetch records from the URIs: {uri_chunk}", exc_info=True)
        return None
    return response
89
+
90
@retry(
    stop=stop_after_attempt(RETRIES),
    wait=wait_exponential(multiplier=EXP_WAIT_MULTIPLIER, min=EXP_WAIT_MIN, max=EXP_WAIT_MAX),
)
def _get_post_details(uri_chunk: list, client: Client, logger: logging.Logger):
    """
    _get_post_details: requests the posts for one chunk of AT-URIs, re-attempted by the tenacity decorator above

    Args:
        uri_chunk (list): AT-URIs to request in one call
        client (Client): client the request is made with
        logger (logging.Logger): logger used to record each failed attempt

    Raises:
        AtProtocolError, RetryError: logged and re-raised so the retry decorator can see the failure

    Returns:
        the response of the get_posts call
    """
    try:
        return AppBskyFeedNamespace(client).get_posts(ParamsDict(uris=uri_chunk))
    except (AtProtocolError, RetryError):
        logger.error(f"Error occurred fetching records from URIs: {uri_chunk}")
        raise
103
+
104
+ def _get_rkey(at_uri: str) -> str:
105
+ match = re.search(r"\w+$", at_uri)
106
+ return match.group()
107
+
108
+ def _get_author_details(author: dict) -> dict:
109
+ author_details = {}
110
+ author_details["did"] = author["did"]
111
+ author_details["handle"] = author["handle"]
112
+ author_details["display_name"] = author["display_name"]
113
+ return author_details
@@ -0,0 +1,61 @@
1
+ import json
2
+ from atproto_client.namespaces.sync_ns import ComAtprotoRepoNamespace
3
+ from atproto_client.models.com.atproto.repo.list_records import ParamsDict
4
+ from atproto import Client
5
+ from atproto.exceptions import AtProtocolError
6
+ import re, time, logging
7
+
8
+ from mdfb.utils.constants import DELAY
9
+
10
+
11
def get_post_identifiers(did: str, limit: int, feed_type: str) -> list[str]:
    """
    get_post_identifiers: Gets up to the given amount of AT-URIs of the posts wanted from the desired account

    Records are listed page by page (at most 100 per request) until ``limit`` URIs were collected
    or the account has no more records of that type. The remaining amount is reduced by the number
    of records actually returned, so a short page does not end the collection early.

    Args:
        did (str): DID of the target account
        limit (int): The amount wanted to get
        feed_type (str): The type of post wanted from the account: like, repost and post

    Raises:
        SystemExit: If there is a failure to retrieve posts

    Returns:
        list[str]: A list of the desired AT-URIs
    """
    cursor = ""
    post_uris = []
    logger = logging.getLogger(__name__)
    client = Client()
    while limit > 0:
        fetch_amount = min(100, limit)
        try:
            logger.info(f"Fetching up to {fetch_amount} posts for DID: {did}, feed_type: {feed_type}")
            res = ComAtprotoRepoNamespace(client).list_records(ParamsDict(
                collection=f"app.bsky.feed.{feed_type}",
                repo=did,
                limit=fetch_amount,
                cursor=cursor,
            ))
            res = json.loads(res.model_dump_json())
        except AtProtocolError as e:
            logger.error(f"Failure to fetch posts: {e}", exc_info=True)
            print("Failure to get fetch posts. See logs for details.")
            raise SystemExit(1) from e

        records = res.get("records") or []
        if not records:
            logger.info(f"No more records to fetch for DID: {did}, feed_type: {feed_type}")
            break

        # Count what the server actually returned, not what was asked for.
        limit -= len(records)
        logger.info("Successful retrieved: %d posts, %d remaining", len(records), limit)

        # Prefer the cursor reported in the response; fall back to the record key of the last
        # record when the response carries none.
        cursor = res.get("cursor") or re.search(r"\w+$", records[-1]["uri"])[0]
        for record in records:
            if feed_type == "post":
                uri = record["uri"]
            else:
                # Likes and reposts point at another record; collect the URI of that subject.
                uri = record["value"]["subject"]["uri"]
            post_uris.append(uri)
        time.sleep(DELAY)
    return post_uris
@@ -0,0 +1,24 @@
1
+ import logging
2
+ from atproto_identity.handle.resolver import HandleResolver
3
+ from atproto_identity.exceptions import DidNotFoundError
4
+
5
def resolve_handle(handle: str) -> str:
    """
    resolve_handle: for a given handle, uses HandleResolver to resolve the handle to a DID

    Args:
        handle (str): handle of the target account

    Raises:
        DidNotFoundError: if the handle cannot be resolved

    Returns:
        str: resolved DID
    """
    logger = logging.getLogger(__name__)
    try:
        did = HandleResolver().ensure_resolve(handle)
    except DidNotFoundError as e:
        logger.error(f"Unable to resolve handle: {handle}")
        # Chain the original error so its cause stays visible in tracebacks.
        raise DidNotFoundError(f"Unable to resolve handle: {handle}") from e
    return did
@@ -0,0 +1,100 @@
1
+ from argparse import ArgumentParser
2
+ from tqdm import tqdm
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+
5
+ from mdfb.core.get_post_identifiers import get_post_identifiers
6
+ from mdfb.core.fetch_post_details import fetch_post_details
7
+ from mdfb.core.download_blobs import download_blobs
8
+ from mdfb.core.resolve_handle import resolve_handle
9
+ from mdfb.utils.validation import *
10
+ from mdfb.utils.helpers import split_list
11
+ from mdfb.utils.logging import setup_logging
12
+ from mdfb.utils.constants import DEFAULT_THREADS
13
+
14
def fetch_posts(did: str, limit: int, post_types: dict) -> list[str]:
    """
    fetch_posts: collects the AT-URIs for every post type that is switched on

    Args:
        did (str): DID of the target account
        limit (int): amount of URIs requested for each enabled post type
        post_types (dict): maps a post type name to a truthy value when that type is wanted

    Returns:
        list[str]: the URIs of all enabled post types, in the order the types appear in post_types
    """
    enabled_types = [name for name, enabled in post_types.items() if enabled]
    collected = []
    for name in enabled_types:
        collected += get_post_identifiers(did, limit, name)
    return collected
20
+
21
def process_posts(posts: list, num_threads: int) -> list[dict]:
    """
    process_posts: fetches the post details for the given post URIs, spreading the work over a thread pool

    Args:
        posts (list): list of URIs of the post wanted
        num_threads (int): number of threads, also the number of batches the URIs are split into

    Returns:
        list[dict]: list of dictionaries that contain post details for each post, in the order the batches finish
    """
    batches = split_list(posts, num_threads)
    gathered = []

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        pending = [pool.submit(fetch_post_details, batch) for batch in batches]
        for finished in as_completed(pending):
            gathered.extend(finished.result())
    return gathered
42
+
43
def main():
    """
    main: command-line entry point; parses the arguments, collects the wanted post URIs, fetches
    their details and downloads the files into the given directory

    Any exception raised along the way, including one raised inside a download worker, is caught
    and printed as "Error: <message>".
    """
    parser = ArgumentParser()

    parser.add_argument("directory", action="store", help="Directory for where all downloaded post will be stored")
    parser.add_argument("-l", "--limit", action="store", required=True, help="The number of posts to be downloaded")
    parser.add_argument("--like", action="store_true", help="To retreive liked posts")
    parser.add_argument("--post", action="store_true", help="To retreive posts")
    parser.add_argument("--repost", action="store_true", help="To retreive reposts")
    parser.add_argument("--threads", action="store", help="Number of threads, maximum of 3 threads")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--did", action="store", help="The DID associated with the account")
    group.add_argument("--handle", action="store", help="The handle for the account e.g. johnny.bsky.social")

    args = parser.parse_args()
    try:
        did = validate_did(args.did) if args.did else resolve_handle(args.handle)
        directory = validate_directory(args.directory)
        limit = validate_limit(args.limit)

        # The log file lives inside the target directory, so logging starts after it is validated.
        setup_logging(directory)

        num_threads = validate_threads(args.threads) if args.threads else DEFAULT_THREADS

        if not any([args.like, args.post, args.repost]):
            raise ValueError("At least one flag (--like, --post, --repost) must be set.")

        post_types = {
            "like": args.like,
            "repost": args.repost,
            "post": args.post
        }

        print("Fetching post identifiers...")
        posts = fetch_posts(did, limit, post_types)

        wanted_post_types = [post_type for post_type, wanted in post_types.items() if wanted]
        account = args.handle if args.handle else did
        validate_no_posts(posts, account, wanted_post_types)

        print("Getting post details...")
        post_details = process_posts(posts, num_threads)

        num_of_posts = len(post_details)
        post_links = split_list(post_details, num_threads)

        with tqdm(total=num_of_posts, desc="Downloading files") as progress_bar:
            with ThreadPoolExecutor(max_workers=num_threads) as executor:
                futures = []
                for batch_post_link in post_links:
                    futures.append(executor.submit(download_blobs, batch_post_link, directory, progress_bar))
                # Read every result: an exception raised inside a worker is only re-raised when
                # its future is inspected, otherwise the failure would pass unnoticed.
                for future in as_completed(futures):
                    future.result()

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()
@@ -0,0 +1,7 @@
1
# Upper bound on worker threads; validate_threads() lowers larger requests to this value.
MAX_THREADS = 3
# Pause passed to time.sleep() between consecutive API requests.
DELAY = 0.25 # in seconds
# Thread count used when the --threads option is not given.
DEFAULT_THREADS = 1
# Attempt limit handed to tenacity's stop_after_attempt() by the retrying request helpers.
RETRIES = 5
# Arguments handed to tenacity's wait_exponential(): the multiplier and the max/min wait bounds.
EXP_WAIT_MULTIPLIER = 1
EXP_WAIT_MAX = 16
EXP_WAIT_MIN = 0.5
@@ -0,0 +1,45 @@
1
def split_list(input_list: list, split_by: int) -> list[list[str]]:
    """
    split_list: divides the list into the given number of consecutive chunks whose sizes differ by at most one, used for handing out work to threads

    The first ``len(input_list) % split_by`` chunks receive one extra element. When the list is
    shorter than ``split_by`` the trailing chunks are empty.

    Args:
        input_list (list): input list
        split_by (int): number of chunks wanted, must be >= 1

    Raises:
        ValueError: if ``split_by`` is less than 1

    Returns:
        list[list[str]]: a list holding exactly ``split_by`` chunks
    """
    if split_by < 1:
        raise ValueError("Please enter split_by to be greater than 0")
    base_size, larger_chunks = divmod(len(input_list), split_by)

    chunks = []
    offset = 0
    for position in range(split_by):
        width = base_size + 1 if position < larger_chunks else base_size
        chunks.append(input_list[offset:offset + width])
        offset += width
    return chunks
26
+
27
def get_chunk(posts: list, chunk_size: int):
    """
    get_chunk: walks through a list and hands it out in consecutive pieces of a specified size.

    Args:
        posts (list): the list to be divided into chunks.
        chunk_size (int): the size of each chunk. Must be >= 1.

    Yields:
        list: a sublist containing up to `chunk_size` elements from the original list.

    Raises:
        ValueError: If `chunk_size` is less than 1. As this is a generator, the error is raised
        when the first chunk is requested.
    """
    if chunk_size < 1:
        raise ValueError("Please enter a chunk size >= 1")
    total = len(posts)
    offset = 0
    while offset < total:
        yield posts[offset:offset + chunk_size]
        offset += chunk_size
@@ -0,0 +1,13 @@
1
+ import datetime
2
+ import logging
3
+ import os
4
+
5
def setup_logging(directory: str):
    """
    setup_logging: configures the root logger to write INFO messages to a timestamped log file

    The file is named "mdfb_<day><month><year>_<hour><minute><second>.log" from the current time
    and is placed in the given directory.

    Args:
        directory (str): directory the log file is created in
    """
    started_at = datetime.datetime.now()
    log_path = os.path.join(directory, started_at.strftime("mdfb_%d%m%Y_%H%M%S.log"))
    logging.basicConfig(
        level=logging.INFO,
        filename=log_path,
        encoding='utf-8',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        format='[%(asctime)s] %(message)s',
    )
@@ -0,0 +1,37 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ from mdfb.utils.constants import MAX_THREADS
5
+
6
def validate_directory(directory: str) -> str:
    """
    validate_directory: checks that the given path is an existing directory and removes trailing "/" characters from it

    Args:
        directory (str): path given on the command line

    Raises:
        ValueError: if the path does not exist or is not a directory

    Returns:
        str: the path without trailing "/"; a path made only of "/" characters is returned as "/"
    """
    # isdir() is already False for a path that does not exist.
    if not os.path.isdir(directory):
        raise ValueError("The given filepath is either not valid or does not exist")
    trimmed = directory.rstrip("/")
    # Stripping "/" itself would leave "", which os.path.join() treats as the current directory.
    return trimmed if trimmed else "/"
10
+
11
def validate_limit(limit: str) -> int:
    """
    validate_limit: converts the limit given on the command line to a positive integer

    Args:
        limit (str): the limit as text; must consist of digits only

    Raises:
        ValueError: if the text is not made of digits, or if the number is below 1

    Returns:
        int: the limit as an integer
    """
    if limit.isdigit():
        amount = int(limit)
        if amount < 1:
            raise ValueError("The given limit is 0 or less")
        return amount
    raise ValueError("The given limit is not a integer")
17
+
18
def validate_did(did: str) -> str:
    """
    validate_did: checks that the given text is shaped like a DID, i.e. "did:<lowercase method>:<identifier>"

    Args:
        did (str): the DID given on the command line

    Raises:
        ValueError: if the text does not match the DID pattern

    Returns:
        str: the DID, unchanged
    """
    matched = re.search(r"^did:[a-z]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$", did)
    if matched is None:
        raise ValueError("The given DID is not valid")
    return did
22
+
23
def validate_threads(threads: str) -> int:
    """
    validate_threads: converts the thread count given on the command line to an integer no larger than MAX_THREADS

    A count above MAX_THREADS is lowered to MAX_THREADS, and the change is both logged and printed.

    Args:
        threads (str): the thread count as text; must consist of digits only

    Raises:
        ValueError: if the text is not made of digits, or if the resulting count is below 1

    Returns:
        int: the thread count to use
    """
    if not threads.isdigit():
        raise ValueError("Please enter an integer")
    requested = int(threads)
    if requested > MAX_THREADS:
        notice = f"Entered {requested} threads, but the maximum is {MAX_THREADS}. Setting to {MAX_THREADS} threads"
        logging.info(notice)
        print(notice + ".")
        requested = MAX_THREADS
    if requested < 1:
        raise ValueError("Please set threads to 1 or more")
    return requested
34
+
35
def validate_no_posts(posts: list, account: str, post_types: list):
    """
    validate_no_posts: stops the run when no posts were found for the account

    Args:
        posts (list): the collected post URIs
        account (str): handle or DID shown in the error message
        post_types (list): the post types that were requested, shown in the error message

    Raises:
        ValueError: if ``posts`` is empty
    """
    if posts:
        return
    raise ValueError(f"There are no posts associated with account: {account}, for post_type(s): {post_types}")
@@ -0,0 +1,45 @@
1
[tool.poetry]
name = "mdfb"
version = "0.1.0"
description = "A CLI for downloading posts in bulk from Bluesky from a specified account"
authors = ["Ibrahim Haji Abdi <ibrahim.hajiabdi09@gmail.com>"]
readme = "README.md"

[tool.poetry.scripts]
mdfb = "mdfb.mdfb:main"

# Runtime requirements only. "tdqm" (a misspelling of tqdm) and the "argparse"
# backport are dropped: tqdm is listed under its real name and argparse ships
# with every supported Python version.
[tool.poetry.dependencies]
python = "<3.14,>=3.8"
atproto = "^0.0.55"
pathvalidate = "^3.2.1"
tenacity = "^9.0.0"
tqdm = "^4.67.1"

# Test tooling, kept out of the published package's install requirements.
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
pytest-mock = "^3.14.0"
requests-mock = "^1.12.1"

[project]
name = "mdfb"
version = "0.1.0"
authors = [
    { name = "Ibrahim Haji Abdi", email = "ibrahim.hajiabdi09@gmail.com" },
]
description = "A CLI for downloading posts in bulk from Bluesky from a specified account"
readme = "README.md"
requires-python = "<3.14,>=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Environment :: Console",
]

[project.urls]
Homepage = "https://github.com/IbrahimHajiAbdi/mass-downloader-for-bluesky"
Issues = "https://github.com/IbrahimHajiAbdi/mass-downloader-for-bluesky/issues"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"