instagram-archiver 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instagram_archiver/__init__.py +10 -0
- instagram_archiver/__main__.py +7 -0
- instagram_archiver/client.py +290 -0
- instagram_archiver/constants.py +66 -0
- instagram_archiver/main.py +131 -0
- instagram_archiver/profile_scraper.py +214 -0
- instagram_archiver/py.typed +0 -0
- instagram_archiver/saved_scraper.py +85 -0
- instagram_archiver/typing.py +195 -0
- instagram_archiver/utils.py +96 -0
- instagram_archiver-0.3.3.dist-info/METADATA +118 -0
- instagram_archiver-0.3.3.dist-info/RECORD +15 -0
- instagram_archiver-0.3.3.dist-info/WHEEL +4 -0
- instagram_archiver-0.3.3.dist-info/entry_points.txt +4 -0
- instagram_archiver-0.3.3.dist-info/licenses/LICENSE.txt +18 -0

instagram_archiver/__init__.py

@@ -0,0 +1,10 @@
"""Instagram archiver."""

from __future__ import annotations

from .client import InstagramClient
from .profile_scraper import ProfileScraper
from .saved_scraper import SavedScraper

__all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
__version__ = 'v0.3.3'

instagram_archiver/client.py

@@ -0,0 +1,290 @@
"""Generic client."""

from __future__ import annotations

from http import HTTPStatus
from os import utime
from pathlib import Path
from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
import json
import logging

from requests import HTTPError
from yt_dlp_utils import setup_session
import requests

from .constants import API_HEADERS, SHARED_HEADERS
from .typing import (
    CarouselMedia,
    Comments,
    Edge,
    HighlightsTray,
    MediaInfo,
    MediaInfoItem,
    MediaInfoItemImageVersions2Candidate,
)
from .utils import get_extension, json_dumps_formatted, write_if_new

if TYPE_CHECKING:
    from collections.abc import Iterable, Mapping
    from types import TracebackType

    from .typing import BrowserName

__all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')

T = TypeVar('T')
log = logging.getLogger(__name__)


class CSRFTokenNotFound(RuntimeError):
    """CSRF token not found in cookies."""


class UnexpectedRedirect(RuntimeError):
    """Unexpected redirect in a request."""


class InstagramClient:
    """Generic client for Instagram."""

    def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
        """
        Initialise the client.

        Parameters
        ----------
        browser : str
            The browser to use.

        browser_profile : str
            The browser profile to use.
        """
        self.session = setup_session(
            browser,
            browser_profile,
            SHARED_HEADERS,
            domains={'instagram.com'},
            status_forcelist=(413, 429, 500, 502, 503, 504),
        )
        self.failed_urls: set[str] = set()
        """Set of failed URLs."""
        self.video_urls: list[str] = []
        """List of video URLs to download."""

    def add_video_url(self, url: str) -> None:
        """Add a video URL to the list of video URLs."""
        log.info('Added video URL: %s', url)
        self.video_urls.append(url)

    def add_csrf_token_header(self) -> None:
        """
        Add CSRF token header to the session.

        Raises
        ------
        CSRFTokenNotFound
            If the CSRF token is not found in the cookies.
        """
        token = self.session.cookies.get('csrftoken')
        if not token:
            raise CSRFTokenNotFound
        self.session.headers.update({'x-csrftoken': token})

    def graphql_query(
        self,
        variables: Mapping[str, Any],
        *,
        cast_to: type[T],  # noqa: ARG002
        doc_id: str = '9806959572732215',
    ) -> T | None:
        """Make a GraphQL query."""
        with self.session.post(
            'https://www.instagram.com/graphql/query',
            headers={
                'content-type': 'application/x-www-form-urlencoded',
            }
            | API_HEADERS,
            data={'doc_id': doc_id, 'variables': json.dumps(variables, separators=(',', ':'))},
        ) as r:
            if r.status_code != HTTPStatus.OK:
                return None
            data = r.json()
            assert isinstance(data, dict)
            if (status := data.get('status')) != 'ok':
                log.error('GraphQL status not "ok": %s', status)
                return None
            if data.get('errors'):
                log.warning('Response has errors.')
                log.debug('Response: %s', json.dumps(data, indent=2))
            if not data.get('data'):
                log.error('No data in response.')
            return cast('T', data['data'])

    def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
        """Get text from a URL."""
        with self.session.get(url, params=params, headers=API_HEADERS) as r:
            r.raise_for_status()
            return r.text

    def highlights_tray(self, user_id: int | str) -> HighlightsTray:
        """Get the highlights tray data for a user."""
        return self.get_json(
            f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
            cast_to=HighlightsTray,
        )

    def __enter__(self) -> Self:  # pragma: no cover
        """Recommended way to initialise the client."""
        return self

    def __exit__(
        self, _: type[BaseException] | None, __: BaseException | None, ___: TracebackType | None
    ) -> None:
        """Clean up."""

    def is_saved(self, url: str) -> bool:  # pragma: no cover # noqa: ARG002, PLR6301
        """Check if a URL is already saved."""
        return False

    def save_to_log(self, url: str) -> None:
        """Save a URL to the log."""

    def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
        """Save images in the image_versions2 dictionary."""

        def key(x: MediaInfoItemImageVersions2Candidate) -> int:
            return x['width'] * x['height']

        best = max(sub_item['image_versions2']['candidates'], key=key)
        if self.is_saved(best['url']):
            return
        r = self.session.head(best['url'])
        if r.status_code != HTTPStatus.OK:
            log.warning('HEAD request failed with status code %s.', r.status_code)
            return
        ext = get_extension(r.headers['content-type'])
        name = f'{sub_item["id"]}.{ext}'
        with Path(name).open('wb') as f:
            f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
        utime(name, (timestamp, timestamp))
        self.save_to_log(r.url)

    def save_comments(self, edge: Edge) -> None:
        """Save comments for an edge node."""
        comment_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["id"]}/comments/'
        shared_params = {'can_support_threading': 'true'}
        try:
            comment_data = self.get_json(
                comment_url,
                params={**shared_params, 'permalink_enabled': 'false'},
                cast_to=Comments,
            )
        except HTTPError:
            log.exception('Failed to get comments.')
            return
        top_comment_data: Any = comment_data
        while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
            try:
                comment_data = self.get_json(
                    comment_url,
                    params={
                        **shared_params,
                        'min_id': comment_data['next_min_id'],
                    },
                    cast_to=Comments,
                )
            except HTTPError:
                log.exception('Failed to get comments.')
                break
            top_comment_data['comments'] = list(top_comment_data['comments']) + list(
                comment_data['comments']
            )
        comments_json = f'{edge["node"]["id"]}-comments.json'
        with Path(comments_json).open('w+', encoding='utf-8') as f:
            json.dump(top_comment_data, f, sort_keys=True, indent=2)

    def save_media(self, edge: Edge) -> None:
        """
        Save media for an edge node.

        Raises
        ------
        UnexpectedRedirect
            If a redirect occurs unexpectedly.
        """
        media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
        log.info('Saving media at URL: %s', media_info_url)
        if self.is_saved(media_info_url):
            return
        r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
        if r.status_code != HTTPStatus.OK:
            if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
                raise UnexpectedRedirect
            log.warning('GET request failed with status code %s.', r.status_code)
            log.debug('Content: %s', r.text)
            return
        if 'image_versions2' not in r.text or 'taken_at' not in r.text:
            log.warning('Invalid response. image_versions2 dict not found.')
            return
        media_info: MediaInfo = r.json()
        timestamp = media_info['items'][0]['taken_at']
        id_json_file = f'{edge["node"]["id"]}.json'
        media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
        write_if_new(id_json_file, str(json_dumps_formatted(edge['node'])))
        write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
        for file in (id_json_file, media_info_json_file):
            utime(file, (timestamp, timestamp))
        self.save_to_log(media_info_url)
        for item in media_info['items']:
            timestamp = item['taken_at']
            if carousel_media := item.get('carousel_media'):
                for sub_item in carousel_media:
                    self.save_image_versions2(sub_item, timestamp)
            elif 'image_versions2' in item:
                self.save_image_versions2(item, timestamp)

    def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
        """Save edge node media."""
        for edge in edges:
            if edge['node']['__typename'] == 'XDTMediaDict':
                try:
                    shortcode = edge['node']['code']
                except KeyError:
                    if parent_edge:
                        try:
                            shortcode = parent_edge['node']['code']
                        except KeyError:
                            log.exception('Unknown shortcode.')
                            return
                    else:
                        log.exception('Unknown shortcode.')
                if edge['node'].get('video_dash_manifest'):
                    self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
                else:
                    try:
                        self.save_comments(edge)
                        self.save_media(edge)
                    except requests.exceptions.RetryError:
                        log.exception('Retries exhausted.')
                        return
            else:
                log.warning(  # type: ignore[unreachable]
                    'Unknown type: `%s`. Item %s will not be processed.',
                    edge['node']['__typename'],
                    edge['node']['id'],
                )
                shortcode = edge['node']['code']
                self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')

    def get_json(
        self,
        url: str,
        *,
        cast_to: type[T],  # noqa: ARG002
        params: Mapping[str, str] | None = None,
    ) -> T:
        """Get JSON data from a URL."""
        with self.session.get(url, params=params, headers=API_HEADERS) as r:
            r.raise_for_status()
            return cast('T', r.json())
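
Reviewer note (not part of the package): a minimal sketch of driving this client directly, assuming the chosen browser profile holds a logged-in instagram.com session; the user ID is a placeholder.

```python
from instagram_archiver import InstagramClient

with InstagramClient(browser='chrome', browser_profile='Default') as client:
    # Raises CSRFTokenNotFound if the browser cookies lack a csrftoken.
    client.add_csrf_token_header()
    tray = client.highlights_tray('123456789')  # Placeholder user ID.
    for item in tray['tray']:
        print(item['id'])
```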

instagram_archiver/constants.py

@@ -0,0 +1,66 @@
"""Constants."""

from __future__ import annotations

__all__ = ('API_HEADERS', 'BROWSER_CHOICES', 'PAGE_FETCH_HEADERS', 'SHARED_HEADERS', 'USER_AGENT')

USER_AGENT = (
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/137.0.0.0 Safari/537.36'
)
"""
User agent.

:meta hide-value:
"""
SHARED_HEADERS = {
    'accept': '*/*',
    'authority': 'www.instagram.com',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'user-agent': USER_AGENT,
    # 'x-asbd-id': '359341',
    # 'x-ig-app-id': '936619743392459',
}
"""
Headers to use for requests.

:meta hide-value:
"""
API_HEADERS = {
    'x-asbd-id': '359341',
    'x-ig-app-id': '936619743392459',
}
"""
Headers to use for API requests.

:meta hide-value:
"""
PAGE_FETCH_HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
    'image/apng,*/*;q=0.8',
    'dpr': '1.5',
    'sec-fetch-mode': 'navigate',  # Definitely required.
    'viewport-width': '3840',
}
"""
Headers to use for fetching HTML pages.

:meta hide-value:
"""
LOG_SCHEMA = """CREATE TABLE log (
    url TEXT PRIMARY KEY NOT NULL,
    date TEXT DEFAULT CURRENT_TIMESTAMP NOT NULL
);"""
"""
Schema for log database.

:meta hide-value:
"""
BROWSER_CHOICES = ('brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari')
"""
Possible browser choices to get cookies from.

:meta hide-value:
"""
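
Reviewer note: `LOG_SCHEMA` is the SQLite DDL that the profile scraper (below) executes against its `.log.db` file. A self-contained illustration using an in-memory database:

```python
import sqlite3

from instagram_archiver.constants import LOG_SCHEMA

conn = sqlite3.connect(':memory:')  # ProfileScraper uses <output_dir>/.log.db.
conn.execute(LOG_SCHEMA)
conn.execute('INSERT INTO log (url) VALUES (?)', ('https://www.instagram.com/p/abc/',))
print(conn.execute('SELECT url, date FROM log').fetchall())  # date defaults to now.
```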

instagram_archiver/main.py

@@ -0,0 +1,131 @@
"""Main application."""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

from bascom import setup_logging
import click

from .client import UnexpectedRedirect
from .constants import BROWSER_CHOICES
from .profile_scraper import ProfileScraper
from .saved_scraper import SavedScraper

if TYPE_CHECKING:
    from .typing import BrowserName

__all__ = ('main',)


@click.command(context_settings={'help_option_names': ('-h', '--help')})
@click.option(
    '-o',
    '--output-dir',
    default='%(username)s',
    help='Output directory.',
    type=click.Path(file_okay=False, writable=True),
)
@click.option(
    '-b',
    '--browser',
    default='chrome',
    type=click.Choice(BROWSER_CHOICES),
    help='Browser to read cookies from.',
)
@click.option('-p', '--profile', default='Default', help='Browser profile.')
@click.option('-d', '--debug', is_flag=True, help='Enable debug output.')
@click.option('--no-log', is_flag=True, help='Ignore log (re-fetch everything).')
@click.option(
    '-C',
    '--include-comments',
    is_flag=True,
    help='Also download all comments (extends download time significantly).',
)
@click.argument('username')
def main(
    output_dir: str,
    username: str,
    browser: BrowserName = 'chrome',
    profile: str = 'Default',
    *,
    debug: bool = False,
    include_comments: bool = False,
    no_log: bool = False,
) -> None:
    """Archive a profile's posts."""  # noqa: DOC501
    setup_logging(
        debug=debug, loggers={'instagram_archiver': {'handlers': ('console',), 'propagate': False}}
    )
    try:
        with ProfileScraper(
            browser=browser,
            browser_profile=profile,
            comments=include_comments,
            disable_log=no_log,
            output_dir=(
                Path(output_dir % {'username': username})
                if '%(username)s' in output_dir
                else Path(output_dir)
            ),
            username=username,
        ) as client:
            client.process()
    except UnexpectedRedirect as e:
        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
        raise click.Abort from e
    except Exception as e:
        if isinstance(e, KeyboardInterrupt) or debug:
            raise
        click.echo('Run with --debug for more information.', err=True)
        raise click.Abort from e


@click.command(context_settings={'help_option_names': ('-h', '--help')})
@click.option(
    '-o',
    '--output-dir',
    default='.',
    help='Output directory.',
    type=click.Path(file_okay=False, writable=True),
)
@click.option(
    '-b',
    '--browser',
    default='chrome',
    type=click.Choice(BROWSER_CHOICES),
    help='Browser to read cookies from.',
)
@click.option('-p', '--profile', default='Default', help='Browser profile.')
@click.option('-d', '--debug', is_flag=True, help='Enable debug output.')
@click.option(
    '-C',
    '--include-comments',
    is_flag=True,
    help='Also download all comments (extends download time significantly).',
)
@click.option('-u', '--unsave', is_flag=True, help='Unsave posts after successful archive.')
def save_saved_main(
    output_dir: str,
    browser: BrowserName = 'chrome',
    profile: str = 'Default',
    *,
    debug: bool = False,
    include_comments: bool = False,
    unsave: bool = False,
) -> None:
    """Archive your saved posts."""  # noqa: DOC501
    setup_logging(
        debug=debug, loggers={'instagram_archiver': {'handlers': ('console',), 'propagate': False}}
    )
    try:
        SavedScraper(browser, profile, output_dir, comments=include_comments).process(unsave=unsave)
    except UnexpectedRedirect as e:
        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
        raise click.Abort from e
    except Exception as e:
        if isinstance(e, KeyboardInterrupt) or debug:
            raise
        click.echo('Run with --debug for more information.', err=True)
        raise click.Abort from e
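
Reviewer note: the `-o` default `'%(username)s'` relies on printf-style string interpolation, which `main` applies before the path reaches `ProfileScraper`. A quick demonstration (the template is hypothetical):

```python
output_dir = '%(username)s/posts'  # Hypothetical -o value.
print(output_dir % {'username': 'someuser'})  # -> someuser/posts
```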

instagram_archiver/profile_scraper.py

@@ -0,0 +1,214 @@
"""Instagram client."""

from __future__ import annotations

from contextlib import chdir
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar, override
from urllib.parse import urlparse
import json
import logging
import sqlite3

from requests import HTTPError
from yt_dlp_utils import get_configured_yt_dlp

from .client import InstagramClient
from .constants import LOG_SCHEMA
from .typing import (
    BrowserName,
    WebProfileInfo,
    XDTAPIV1FeedUserTimelineGraphQLConnectionContainer,
)
from .utils import SaveCommentsCheckDisabledMixin

if TYPE_CHECKING:
    from types import TracebackType

__all__ = ('ProfileScraper',)

T = TypeVar('T')
log = logging.getLogger(__name__)


def _clean_url(url: str) -> str:
    parsed = urlparse(url)
    return f'https://{parsed.netloc}{parsed.path}'


class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
    """The scraper."""

    def __init__(
        self,
        username: str,
        *,
        log_file: str | Path | None = None,
        output_dir: str | Path | None = None,
        disable_log: bool = False,
        browser: BrowserName = 'chrome',
        browser_profile: str = 'Default',
        comments: bool = False,
    ) -> None:
        """
        Initialise ``ProfileScraper``.

        Parameters
        ----------
        username : str
            The username to scrape.
        log_file : str | Path | None
            The log file to use.
        output_dir : str | Path | None
            The output directory to save the posts to.
        disable_log : bool
            Whether to disable logging or not.
        browser : BrowserName
            The browser to use.
        browser_profile : str
            The browser profile to use.
        comments : bool
            Whether to save comments or not.
        """
        super().__init__(browser, browser_profile)
        self._no_log = disable_log
        self._output_dir = Path(output_dir or Path.cwd() / username)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._log_db = Path(log_file or self._output_dir / '.log.db')
        self._connection = sqlite3.connect(self._log_db)
        self._cursor = self._connection.cursor()
        self._setup_db()
        self._username = username
        self.should_save_comments = comments

    def _setup_db(self) -> None:
        if self._no_log:
            return
        existed = self._log_db.exists()
        if not existed or (existed and self._log_db.stat().st_size == 0):
            log.debug('Creating schema.')
            self._cursor.execute(LOG_SCHEMA)

    @override
    def save_to_log(self, url: str) -> None:
        if self._no_log:
            return
        self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
        self._connection.commit()

    @override
    def is_saved(self, url: str) -> bool:
        if self._no_log:
            return False
        self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
        count: int
        (count,) = self._cursor.fetchone()
        return count == 1

    @override
    def __exit__(
        self, _: type[BaseException] | None, __: BaseException | None, ___: TracebackType | None
    ) -> None:
        """Clean up."""
        self._cursor.close()
        self._connection.close()

    def process(self) -> None:
        """Process posts."""
        with chdir(self._output_dir):
            self.get_text(f'https://www.instagram.com/{self._username}/')
            self.add_csrf_token_header()
            r = self.get_json(
                'https://i.instagram.com/api/v1/users/web_profile_info/',
                params={'username': self._username},
                cast_to=WebProfileInfo,
            )
            if 'data' in r:
                with Path('web_profile_info.json').open('w', encoding='utf-8') as f:
                    json.dump(r, f, indent=2, sort_keys=True)
                user_info = r['data']['user']
                if not self.is_saved(user_info['profile_pic_url_hd']):
                    with Path('profile_pic.jpg').open('wb') as f:
                        f.writelines(
                            self.session.get(
                                user_info['profile_pic_url_hd'], stream=True
                            ).iter_content(chunk_size=512)
                        )
                    self.save_to_log(user_info['profile_pic_url_hd'])
                try:
                    for item in self.highlights_tray(user_info['id'])['tray']:
                        self.add_video_url(
                            'https://www.instagram.com/stories/highlights/'
                            f'{item["id"].split(":")[-1]}/'
                        )
                except HTTPError:
                    log.exception('Failed to get highlights data.')
                self.save_edges(user_info['edge_owner_to_timeline_media']['edges'])
            else:
                log.warning(
                    'Failed to get user info. Profile information and image will not be saved.'
                )
            d = self.graphql_query(
                {
                    'data': {
                        'count': 12,
                        'include_reel_media_seen_timestamp': True,
                        'include_relationship_info': True,
                        'latest_besties_reel_media': True,
                        'latest_reel_media': True,
                    },
                    'username': self._username,
                    '__relay_internal__pv__PolarisIsLoggedInrelayprovider': True,
                    '__relay_internal__pv__PolarisShareSheetV3relayprovider': True,
                },
                cast_to=XDTAPIV1FeedUserTimelineGraphQLConnectionContainer,
            )
            if not d:
                log.error('First GraphQL query failed.')
            else:
                self.save_edges(d['xdt_api__v1__feed__user_timeline_graphql_connection']['edges'])
                page_info = d['xdt_api__v1__feed__user_timeline_graphql_connection']['page_info']
                while page_info['has_next_page']:
                    d = self.graphql_query(
                        {
                            'after': page_info['end_cursor'],
                            'before': None,
                            'data': {
                                'count': 12,
                                'include_reel_media_seen_timestamp': True,
                                'include_relationship_info': True,
                                'latest_besties_reel_media': True,
                                'latest_reel_media': True,
                            },
                            'first': 12,
                            'last': None,
                            'username': self._username,
                            '__relay_internal__pv__PolarisIsLoggedInrelayprovider': True,
                            '__relay_internal__pv__PolarisShareSheetV3relayprovider': True,
                        },
                        cast_to=XDTAPIV1FeedUserTimelineGraphQLConnectionContainer,
                    )
                    if not d:
                        break
                    page_info = d['xdt_api__v1__feed__user_timeline_graphql_connection'][
                        'page_info'
                    ]
                    self.save_edges(
                        d['xdt_api__v1__feed__user_timeline_graphql_connection']['edges']
                    )
            if self.video_urls:
                with get_configured_yt_dlp() as ydl:
                    while self.video_urls and (url := self.video_urls.pop()):
                        if self.is_saved(url):
                            log.info('%s is already saved.', url)
                            continue
                        if ydl.extract_info(url):
                            log.info('Downloading video: %s', url)
                            self.save_to_log(url)
                        else:
                            self.failed_urls.add(url)
            if self.failed_urls:
                log.warning('Some URIs failed. Check failed.txt.')
                with Path('failed.txt').open('w', encoding='utf-8') as f:
                    for url in self.failed_urls:
                        f.write(f'{url}\n')
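
Reviewer note: a programmatic equivalent of the `instagram-archiver` CLI, sketched from the signature above; the username and output path are placeholders.

```python
from pathlib import Path

from instagram_archiver import ProfileScraper

with ProfileScraper('someuser',
                    output_dir=Path('backups') / 'someuser',
                    browser='chrome',
                    comments=False) as scraper:
    scraper.process()  # Writes JSON, images and failed.txt into output_dir.
```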

instagram_archiver/py.typed

File without changes (empty marker file).

instagram_archiver/saved_scraper.py

@@ -0,0 +1,85 @@
"""Saved posts scraper."""

from __future__ import annotations

from contextlib import chdir
from pathlib import Path
from typing import TYPE_CHECKING, Any
import logging

from .client import InstagramClient
from .constants import API_HEADERS, PAGE_FETCH_HEADERS
from .utils import SaveCommentsCheckDisabledMixin

if TYPE_CHECKING:
    from collections.abc import Iterable

    from .typing import BrowserName

__all__ = ('SavedScraper',)
log = logging.getLogger(__name__)


class SavedScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
    """Scrape saved posts."""

    def __init__(
        self,
        browser: BrowserName = 'chrome',
        browser_profile: str = 'Default',
        output_dir: str | Path | None = None,
        *,
        comments: bool = False,
    ) -> None:
        """
        Initialise ``SavedScraper``.

        Parameters
        ----------
        browser : BrowserName
            The browser to use.
        browser_profile : str
            The browser profile to use.
        output_dir : str | Path | None
            The output directory to save the posts to.
        comments : bool
            Whether to save comments or not.
        """
        super().__init__(browser, browser_profile)
        self._output_dir = Path(output_dir or Path.cwd() / '@@saved-posts@@')
        Path(self._output_dir).mkdir(parents=True, exist_ok=True)
        self.should_save_comments = comments

    def unsave(self, items: Iterable[str]) -> None:
        """Unsave saved posts."""
        for item in items:
            log.info('Unsaving %s.', item)
            self.session.post(
                f'https://www.instagram.com/web/save/{item}/unsave/', headers=API_HEADERS
            )

    def process(self, *, unsave: bool = False) -> None:
        """Process the saved posts."""
        with chdir(self._output_dir):
            self.add_csrf_token_header()
            self.session.get('https://www.instagram.com/', headers=PAGE_FETCH_HEADERS)
            feed = self.get_json(
                'https://www.instagram.com/api/v1/feed/saved/posts/', cast_to=dict[str, Any]
            )
            self.save_edges(
                {
                    'node': {
                        '__typename': 'XDTMediaDict',
                        'id': item['media']['id'],
                        'code': item['media']['code'],
                        'owner': item['media']['owner'],
                        'pk': item['media']['pk'],
                        'video_dash_manifest': item['media'].get('video_dash_manifest'),
                    }
                }
                for item in feed['items']
            )
            if unsave:
                self.unsave(item['media']['code'] for item in feed['items'])
            if feed.get('more_available'):
                log.warning('Unhandled pagination.')
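
Reviewer note: the saved-posts analogue, mirroring how `save_saved_main` constructs it (browser, profile and output directory are positional); values are placeholders.

```python
from instagram_archiver import SavedScraper

# comments=True also archives comment threads; unsave=True would mirror -u.
SavedScraper('chrome', 'Default', 'saved-backup', comments=True).process(unsave=False)
```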

instagram_archiver/typing.py

@@ -0,0 +1,195 @@
"""Typing helpers."""

# ruff: noqa: D101
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, NotRequired, TypedDict

if TYPE_CHECKING:
    from collections.abc import Sequence

__all__ = (
    'BrowserName',
    'CarouselMedia',
    'Comments',
    'Edge',
    'HasID',
    'HighlightsTray',
    'MediaInfo',
    'MediaInfoItem',
    'MediaInfoItemImageVersions2Candidate',
    'UserInfo',
    'WebProfileInfo',
    'WebProfileInfoData',
    'XDTAPIV1FeedUserTimelineGraphQLConnection',
    'XDTAPIV1FeedUserTimelineGraphQLConnectionContainer',
    'XDTMediaDict',
)


class MediaInfoItemVideoVersion(TypedDict):
    height: int
    url: str
    width: int


class MediaInfoItemImageVersions2Candidate(TypedDict):
    height: int
    """Height of the image."""
    url: str
    """URL of the image."""
    width: int
    """Width of the image."""


class HighlightItem(TypedDict):
    id: str
    """Identifier."""


class HighlightsTray(TypedDict):
    tray: Sequence[HighlightItem]
    """Highlights tray items."""


class PageInfo(TypedDict):
    end_cursor: str
    """End cursor for pagination."""
    has_next_page: bool
    """Whether there are more pages."""


class EdgeOwnerToTimelineMedia(TypedDict):
    edges: Sequence[Edge]
    page_info: PageInfo
    """Pagination information."""


class UserInfo(TypedDict):
    """User information."""

    edge_owner_to_timeline_media: EdgeOwnerToTimelineMedia
    """Timeline media edge."""
    id: str
    """User ID."""
    profile_pic_url_hd: str
    """Profile picture URL."""


class MediaInfoItemImageVersions2(TypedDict):
    candidates: Sequence[MediaInfoItemImageVersions2Candidate]
    """Image versions."""


class CarouselMedia(TypedDict):
    image_versions2: MediaInfoItemImageVersions2
    """Image versions."""
    id: str
    """Identifier."""


class HasID(TypedDict):
    """Dictionary with an ``id`` field."""

    id: str
    """Identifier."""


class MediaInfoItem(TypedDict):
    """Media information item."""

    carousel_media: NotRequired[Sequence[CarouselMedia] | None]
    """Carousel media items."""
    image_versions2: MediaInfoItemImageVersions2
    """Image versions."""
    id: str
    """Identifier."""
    taken_at: int
    """Timestamp when the media was taken."""
    user: HasID
    """User who posted the media."""
    video_dash_manifest: NotRequired[str | None]
    """URL of the video dash manifest."""
    video_duration: float
    """Duration of the video in seconds."""
    video_versions: Sequence[MediaInfoItemVideoVersion]
    """Video versions."""


class Comments(TypedDict):
    """Comments container."""

    can_view_more_preview_comments: bool
    """Whether more preview comments can be viewed."""
    comments: Sequence[HasID]
    """List of comments."""
    next_min_id: str
    """Next minimum ID for pagination."""


class MediaInfo(TypedDict):
    """Media information."""

    items: Sequence[MediaInfoItem]
    """List of media items."""


class Owner(TypedDict):
    id: str
    """Owner ID."""
    username: str
    """Owner username."""


class XDTMediaDict(TypedDict):
    __typename: Literal['XDTMediaDict']
    """Type name."""
    code: str
    """Short code."""
    id: str
    """Media ID."""
    owner: Owner
    """Owner information."""
    pk: str
    """Primary key. Also carousel ID."""
    video_dash_manifest: NotRequired[str | None]
    """Video dash manifest URL, if available."""


class Edge(TypedDict):
    """Edge of a graph."""

    node: XDTMediaDict
    """Node at this edge."""


class XDTAPIV1FeedUserTimelineGraphQLConnection(TypedDict):
    edges: Sequence[Edge]
    """Edges of the graph."""
    page_info: PageInfo
    """Pagination information."""


class XDTAPIV1FeedUserTimelineGraphQLConnectionContainer(TypedDict):
    """Container for :py:class:`XDTAPIV1FeedUserTimelineGraphQLConnection`."""

    xdt_api__v1__feed__user_timeline_graphql_connection: XDTAPIV1FeedUserTimelineGraphQLConnection
    """User timeline data."""


class WebProfileInfoData(TypedDict):
    user: UserInfo
    """User information."""


class WebProfileInfo(TypedDict):
    """Profile information container."""

    data: NotRequired[WebProfileInfoData]
    """Profile data."""


BrowserName = Literal[
    'brave', 'chrome', 'chromium', 'edge', 'firefox', 'opera', 'safari', 'vivaldi'
]
"""Possible browser choices to get cookies from."""
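
Reviewer note: these `TypedDict`s are ordinary dicts at runtime, so an `Edge` can be written as a literal — which is how `saved_scraper.py` adapts the saved feed for `save_edges`. The values below are placeholders.

```python
from instagram_archiver.typing import Edge

edge: Edge = {
    'node': {
        '__typename': 'XDTMediaDict',
        'code': 'abc123',  # Placeholder shortcode.
        'id': '1',
        'owner': {'id': '2', 'username': 'someuser'},
        'pk': '1',
        'video_dash_manifest': None,  # NotRequired; None means no video.
    }
}
```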

instagram_archiver/utils.py

@@ -0,0 +1,96 @@
"""Utility functions."""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, override
import json

import click

if TYPE_CHECKING:
    from .typing import Edge

__all__ = (
    'JSONFormattedString',
    'UnknownMimetypeError',
    'get_extension',
    'json_dumps_formatted',
    'write_if_new',
)

T = TypeVar('T')


class JSONFormattedString:
    """Contains a formatted version of the JSON str and the original value."""

    def __init__(self, formatted: str, original: Any) -> None:
        self.formatted = formatted
        """Formatted JSON string."""
        self.original_value = original
        """Original value."""

    @override
    def __str__(self) -> str:
        return self.formatted


def json_dumps_formatted(obj: Any) -> JSONFormattedString:
    """
    Return a special object with the formatted version of the JSON str and the original.

    Parameters
    ----------
    obj : Any
        The object to be formatted.
    """
    return JSONFormattedString(json.dumps(obj, sort_keys=True, indent=2), obj)


def write_if_new(target: Path | str, content: str | bytes, mode: str = 'w') -> None:
    """Write a file only if it will be a new file."""
    if not Path(target).is_file():
        with click.open_file(str(target), mode) as f:
            f.write(content)


class UnknownMimetypeError(Exception):
    """Raised when an unknown mimetype is encountered in :py:func:`~get_extension`."""


def get_extension(mimetype: str) -> Literal['png', 'jpg']:
    """
    Get the appropriate three-letter extension for a mimetype.

    Raises
    ------
    UnknownMimetypeError
        If the mimetype is not recognised.
    """
    if mimetype == 'image/jpeg':
        return 'jpg'
    if mimetype == 'image/png':
        return 'png'
    raise UnknownMimetypeError(mimetype)


if TYPE_CHECKING:

    class InstagramClientInterface(Protocol):
        should_save_comments: bool

        def save_comments(self, edge: Edge) -> None: ...

else:
    InstagramClientInterface = object


class SaveCommentsCheckDisabledMixin(InstagramClientInterface):
    """Mixin to control saving comments."""

    @override
    def save_comments(self, edge: Edge) -> None:
        if not self.should_save_comments:
            return
        super().save_comments(edge)  # type: ignore[safe-super]
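
Reviewer note: a quick tour of the helpers above; the target filename is arbitrary.

```python
from instagram_archiver.utils import get_extension, json_dumps_formatted, write_if_new

print(get_extension('image/jpeg'))  # -> 'jpg'; unknown types raise UnknownMimetypeError.
doc = json_dumps_formatted({'b': 1, 'a': 2})
print(doc.original_value)               # The original object is retained alongside str(doc).
write_if_new('example.json', str(doc))  # No-op when example.json already exists.
```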

instagram_archiver-0.3.3.dist-info/METADATA

@@ -0,0 +1,118 @@
Metadata-Version: 2.4
Name: instagram-archiver
Version: 0.3.3
Summary: Save Instagram content you have access to.
License-Expression: MIT
License-File: LICENSE.txt
Keywords: command line,instagram
Author: Andrew Udvare
Author-email: audvare@gmail.com
Requires-Python: >=3.12,<4.0
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Typing :: Typed
Requires-Dist: bascom (>=0.0.4,<0.0.5)
Requires-Dist: click (>=8.3.0,<9.0.0)
Requires-Dist: requests (>=2.32.5,<3.0.0)
Requires-Dist: typing-extensions (>=4.15.0,<5.0.0)
Requires-Dist: yt-dlp-utils (>=0.0.6,<0.0.7)
Project-URL: Documentation, https://instagram-archiver.readthedocs.org
Project-URL: Homepage, https://tatsh.github.io/instagram-archiver/
Project-URL: Issues, https://github.com/Tatsh/instagram-archiver/issues
Project-URL: Repository, https://github.com/Tatsh/instagram-archiver
Description-Content-Type: text/markdown

# instagram-archiver

[](https://www.python.org/)
[](https://pypi.org/project/instagram-archiver/)
[](https://github.com/Tatsh/instagram-archiver/tags)
[](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
[](https://github.com/Tatsh/instagram-archiver/compare/v0.3.3...master)
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/codeql.yml)
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
[](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
[](https://instagram-archiver.readthedocs.org/?badge=latest)
[](http://mypy-lang.org/)
[](https://github.com/pre-commit/pre-commit)
[](http://www.pydocstyle.org/en/stable/)
[](https://docs.pytest.org/en/stable/)
[](https://github.com/astral-sh/ruff)
[](https://pepy.tech/project/instagram-archiver)
[](https://github.com/Tatsh/instagram-archiver/stargazers)

[](https://bsky.app/profile/Tatsh.bsky.social)
[](https://hostux.social/@Tatsh)

Save Instagram content you have access to.

## Installation

### Poetry

```shell
poetry add instagram-archiver
```

### Pip

```shell
pip install instagram-archiver
```

## Usage

```plain
Usage: instagram-archiver [OPTIONS] USERNAME

  Archive a profile's posts.

Options:
  -o, --output-dir DIRECTORY  Output directory.
  -b, --browser [brave|chrome|chromium|edge|opera|vivaldi|firefox|safari]
                              Browser to read cookies from.
  -p, --profile TEXT          Browser profile.
  -d, --debug                 Enable debug output.
  --no-log                    Ignore log (re-fetch everything).
  -C, --include-comments      Also download all comments (extends download
                              time significantly).
  -h, --help                  Show this message and exit.
```

Typical use:

```shell
instagram-archiver -o ~/instagram-backups/username username
```

### `instagram-save-saved`

This tool saves your saved posts (at `www.instagram.com/username/saved/all-posts`).

```plain
Usage: instagram-save-saved [OPTIONS]

  Archive your saved posts.

Options:
  -o, --output-dir DIRECTORY  Output directory.
  -b, --browser [brave|chrome|chromium|edge|opera|vivaldi|firefox|safari]
                              Browser to read cookies from.
  -p, --profile TEXT          Browser profile.
  -d, --debug                 Enable debug output.
  -C, --include-comments      Also download all comments (extends download
                              time significantly).
  -u, --unsave                Unsave posts after successful archive.
  -h, --help                  Show this message and exit.
```

## Notes

The default output path is the username under the current working directory.

Videos are saved using yt-dlp and its respective configuration.

instagram_archiver-0.3.3.dist-info/RECORD

@@ -0,0 +1,15 @@
instagram_archiver/__init__.py,sha256=d74_Lpwy4wMo0DjrJEFfFqI37yBeM_fso7n-a6IJVwY,271
instagram_archiver/__main__.py,sha256=oQb8YAn5SGNM15OuCcOYPqBDgQtNFZ7FZJvVdLRsckE,116
instagram_archiver/client.py,sha256=AnTFq7NivYfEBUIGHn-teh2Gw_vsgUT2XscOoe3wyC4,10689
instagram_archiver/constants.py,sha256=eN33N_nd3Nuo6fnlXZzhYl0QfGBjVwdtq9dZwRys8_M,1446
instagram_archiver/main.py,sha256=LYnJcnRZATrGkuXh_oE7X8Xd_Li7NXzXIFfpnSh2AfY,4032
instagram_archiver/profile_scraper.py,sha256=uRcVA39h3X59LFXbBST2aWyyxgcemyhM2qd8BBibeT8,8345
instagram_archiver/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
instagram_archiver/saved_scraper.py,sha256=9x9UVspraq1H3zyG8gn7Q_2GPu7iCPG7zZpnsARZLd8,2850
instagram_archiver/typing.py,sha256=1vOr3qIiOZhvzSSBpUc3iFKgEZE5ryd_oCysF_ZGpls,4395
instagram_archiver/utils.py,sha256=4wePufLtJZjiSRucMz5kD-_mM1SJyOynusZONqvyonE,2378
instagram_archiver-0.3.3.dist-info/METADATA,sha256=MzIVF7R_Vm3kOXnb646fwS8DuoHylBfV2ydwoQbgLuk,5996
instagram_archiver-0.3.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
instagram_archiver-0.3.3.dist-info/entry_points.txt,sha256=kNXd0Sy6896DEBRcx2mVYiaE-OR9-XR56MpWuaNa49g,128
instagram_archiver-0.3.3.dist-info/licenses/LICENSE.txt,sha256=cDLmbhzFwEUz5FL_OnA6Jp9zdz80330J6YyEq-00yNQ,1093
instagram_archiver-0.3.3.dist-info/RECORD,,

instagram_archiver-0.3.3.dist-info/licenses/LICENSE.txt

@@ -0,0 +1,18 @@
The MIT License (MIT)

Copyright (c) 2025 instagram-archiver authors

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.