instagram-archiver 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of instagram-archiver might be problematic.
- instagram_archiver/__init__.py +8 -2
- instagram_archiver/__main__.py +4 -1
- instagram_archiver/client.py +232 -270
- instagram_archiver/constants.py +51 -95
- instagram_archiver/main.py +80 -57
- instagram_archiver/profile_scraper.py +194 -0
- instagram_archiver/saved_scraper.py +79 -0
- instagram_archiver/typing.py +172 -0
- instagram_archiver/utils.py +97 -79
- instagram_archiver-0.3.1.dist-info/LICENSE.txt +18 -0
- instagram_archiver-0.3.1.dist-info/METADATA +119 -0
- instagram_archiver-0.3.1.dist-info/RECORD +15 -0
- {instagram_archiver-0.2.1.dist-info → instagram_archiver-0.3.1.dist-info}/WHEEL +1 -1
- instagram_archiver-0.3.1.dist-info/entry_points.txt +4 -0
- instagram_archiver/find_query_hashes.py +0 -31
- instagram_archiver/ig_typing.py +0 -173
- instagram_archiver-0.2.1.dist-info/LICENSE.txt +0 -21
- instagram_archiver-0.2.1.dist-info/METADATA +0 -44
- instagram_archiver-0.2.1.dist-info/RECORD +0 -14
- instagram_archiver-0.2.1.dist-info/entry_points.txt +0 -3
instagram_archiver/__init__.py
CHANGED
@@ -1,3 +1,9 @@
-…
+"""Instagram archiver."""
+from __future__ import annotations
 
-…
+from .client import InstagramClient
+from .profile_scraper import ProfileScraper
+from .saved_scraper import SavedScraper
+
+__all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
+__version__ = 'v0.3.1'
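The new package root re-exports the three public classes and pins the version string. A minimal sketch of the resulting import surface, grounded only in the `__all__` and `__version__` lines above:

```python
# Sketch: names importable from the package root as of 0.3.1, per the
# __all__ and __version__ additions in the diff above.
import instagram_archiver
from instagram_archiver import InstagramClient, ProfileScraper, SavedScraper

print(instagram_archiver.__version__)  # 'v0.3.1'
```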
instagram_archiver/__main__.py
CHANGED
instagram_archiver/client.py
CHANGED
@@ -1,168 +1,223 @@
-…
-from …
-…
+"""Generic client."""
+from __future__ import annotations
+
+from http import HTTPStatus
+from os import utime
 from pathlib import Path
-from …
-from typing import Collection, Literal, Mapping, Type, TypeVar, overload
-from urllib.parse import urlparse
+from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
 import json
-import …
-…
-…
-from …
-from ratelimit import limits, sleep_and_retry
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-from yt_dlp.cookies import extract_cookies_from_browser
+import logging
+
+from requests import HTTPError
+from yt_dlp_utils import setup_session
 import requests
-import yt_dlp
 
-from .constants import …
-from .…
-…
-…
+from .constants import API_HEADERS, SHARED_HEADERS
+from .typing import (
+    CarouselMedia,
+    Comments,
+    Edge,
+    HighlightsTray,
+    MediaInfo,
+    MediaInfoItem,
+    MediaInfoItemImageVersions2Candidate,
+)
+from .utils import get_extension, json_dumps_formatted, write_if_new
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Mapping
+    from types import TracebackType
 
-…
+    from .typing import BrowserName
+
+__all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')
 
 T = TypeVar('T')
+log = logging.getLogger(__name__)
 
 
-…
-…
-    return f'https://{parsed.netloc}{parsed.path}'
+class CSRFTokenNotFound(RuntimeError):
+    """CSRF token not found in cookies."""
 
 
-class …
-…
+class UnexpectedRedirect(RuntimeError):
+    """Unexpected redirect in a request."""
 
 
 class InstagramClient:
-    """…
-    def __init__(self, …
-… [old lines 42-115 truncated in the source diff]
+    """Generic client for Instagram."""
+    def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
+        """
+        Initialise the client.
+
+        Parameters
+        ----------
+        browser : str
+            The browser to use.
+
+        browser_profile : str
+            The browser profile to use.
+        """
+        self.session = setup_session(browser,
+                                     browser_profile,
+                                     SHARED_HEADERS,
+                                     domains={'instagram.com'},
+                                     status_forcelist=(413, 429, 500, 502, 503, 504))
+        self.failed_urls: set[str] = set()
+        """Set of failed URLs."""
+        self.video_urls: list[str] = []
+        """List of video URLs to download."""
+
+    def add_video_url(self, url: str) -> None:
+        """Add a video URL to the list of video URLs."""
+        log.info('Added video URL: %s', url)
+        self.video_urls.append(url)
+
+    def add_csrf_token_header(self) -> None:
+        """
+        Add CSRF token header to the session.
+
+        Raises
+        ------
+        CSRFTokenNotFound
+            If the CSRF token is not found in the cookies.
+        """
+        token = self.session.cookies.get('csrftoken')
+        if not token:
+            raise CSRFTokenNotFound
+        self.session.headers.update({'x-csrftoken': token})
+
+    def graphql_query(self,
+                      variables: Mapping[str, Any],
+                      *,
+                      cast_to: type[T],
+                      doc_id: str = '9806959572732215') -> T | None:
+        """Make a GraphQL query."""
+        with self.session.post('https://www.instagram.com/graphql/query',
+                               headers={
+                                   'content-type': 'application/x-www-form-urlencoded',
+                               } | API_HEADERS,
+                               data={
+                                   'doc_id': doc_id,
+                                   'variables': json.dumps(variables, separators=(',', ':'))
+                               }) as r:
+            if r.status_code != HTTPStatus.OK:
+                return None
+            data = r.json()
+            assert isinstance(data, dict)
+            if (status := data.get('status')) != 'ok':
+                log.error('GraphQL status not "ok": %s', status)
+                return None
+            if data.get('errors'):
+                log.warning('Response has errors.')
+                log.debug('Response: %s', json.dumps(data, indent=2))
+            if not data.get('data'):
+                log.error('No data in response.')
+            return cast('T', data['data'])
+
+    def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
+        """Get text from a URL."""
+        with self.session.get(url, params=params, headers=API_HEADERS) as r:
+            r.raise_for_status()
+            return r.text
+
+    def highlights_tray(self, user_id: int | str) -> HighlightsTray:
+        """Get the highlights tray data for a user."""
+        return self.get_json(
+            f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
+            cast_to=HighlightsTray)
+
+    def __enter__(self) -> Self:  # pragma: no cover
+        """Recommended way to initialise the client."""
+        return self
+
+    def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
+                 ___: TracebackType | None) -> None:
+        """Clean up."""
+
+    def is_saved(self, url: str) -> bool:  # pragma: no cover
+        """Check if a URL is already saved."""
+        return False
+
+    def save_to_log(self, url: str) -> None:
+        """Save a URL to the log."""
+
+    def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
+        """Save images in the image_versions2 dictionary."""
         def key(x: MediaInfoItemImageVersions2Candidate) -> int:
             return x['width'] * x['height']
 
-        best = …
-        if self.…
+        best = max(sub_item['image_versions2']['candidates'], key=key)
+        if self.is_saved(best['url']):
+            return
+        r = self.session.head(best['url'])
+        if r.status_code != HTTPStatus.OK:
+            log.warning('HEAD request failed with status code %s.', r.status_code)
             return
-        r = self._session.head(best['url'])
-        r.raise_for_status()
         ext = get_extension(r.headers['content-type'])
         name = f'{sub_item["id"]}.{ext}'
-        with open(…
-…
-                stream=True).iter_content(chunk_size=512)):
-            f.write(content)
+        with Path(name).open('wb') as f:
+            f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
         utime(name, (timestamp, timestamp))
-        self.…
-… [old lines 132-160 truncated in the source diff]
+        self.save_to_log(r.url)
+
+    def save_comments(self, edge: Edge) -> None:
+        """Save comments for an edge node."""
+        comment_url = ('https://www.instagram.com/api/v1/media/'
+                       f'{edge["node"]["id"]}/comments/')
+        shared_params = {'can_support_threading': 'true'}
+        try:
+            comment_data = self.get_json(comment_url,
+                                         params={
+                                             **shared_params, 'permalink_enabled': 'false'
+                                         },
+                                         cast_to=Comments)
+        except HTTPError:
+            log.exception('Failed to get comments.')
+            return
+        top_comment_data: Any = comment_data
+        while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
+            try:
+                comment_data = self.get_json(comment_url,
+                                             params={
+                                                 **shared_params,
+                                                 'min_id':
+                                                     comment_data['next_min_id'],
+                                             },
+                                             cast_to=Comments)
+            except HTTPError:
+                log.exception('Failed to get comments.')
+                break
+            top_comment_data['comments'] = (list(top_comment_data['comments']) +
+                                            list(comment_data['comments']))
+        comments_json = f'{edge["node"]["id"]}-comments.json'
+        with Path(comments_json).open('w+', encoding='utf-8') as f:
+            json.dump(top_comment_data, f, sort_keys=True, indent=2)
+
+    def save_media(self, edge: Edge) -> None:
+        """
+        Save media for an edge node.
+
+        Raises
+        ------
+        UnexpectedRedirect
+            If a redirect occurs unexpectedly.
+        """
+        media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
+        log.info('Saving media at URL: %s', media_info_url)
+        if self.is_saved(media_info_url):
+            return
+        r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
+        if r.status_code != HTTPStatus.OK:
+            if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
+                raise UnexpectedRedirect
+            log.warning('GET request failed with status code %s.', r.status_code)
+            log.debug('Content: %s', r.text)
             return
-… [old lines 162-165 truncated in the source diff]
+        if 'image_versions2' not in r.text or 'taken_at' not in r.text:
+            log.warning('Invalid response. image_versions2 dict not found.')
+            return
+        media_info: MediaInfo = r.json()
         timestamp = media_info['items'][0]['taken_at']
         id_json_file = f'{edge["node"]["id"]}.json'
         media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -170,141 +225,48 @@ class InstagramClient:
         write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
         for file in (id_json_file, media_info_json_file):
             utime(file, (timestamp, timestamp))
-        self.…
+        self.save_to_log(media_info_url)
         for item in media_info['items']:
             timestamp = item['taken_at']
-            if …
-                for sub_item in …
-                    self.…
+            if (carousel_media := item.get('carousel_media')):
+                for sub_item in carousel_media:
+                    self.save_image_versions2(sub_item, timestamp)
             elif 'image_versions2' in item:
-                self.…
+                self.save_image_versions2(item, timestamp)
 
-    def …
+    def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
+        """Save edge node media."""
         for edge in edges:
-            if edge['node']['__typename'] == '…
+            if edge['node']['__typename'] == 'XDTMediaDict':
                 try:
-                    shortcode = edge['node']['…
-                except KeyError…
+                    shortcode = edge['node']['code']
+                except KeyError:
                     if parent_edge:
                         try:
-                            shortcode = parent_edge['node']['…
-                        except KeyError…
-…
+                            shortcode = parent_edge['node']['code']
+                        except KeyError:
+                            log.exception('Unknown shortcode.')
+                            return
                     else:
-… [old lines 194-203 truncated in the source diff]
+                        log.exception('Unknown shortcode.')
+                if edge['node'].get('video_dash_manifest'):
+                    self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
+                else:
+                    try:
+                        self.save_comments(edge)
+                        self.save_media(edge)
+                    except requests.exceptions.RetryError:
+                        log.exception('Retries exhausted.')
+                        return
             else:
-… [old lines 205-210 truncated in the source diff]
-    @overload
-    def _get_rate_limited(self,
-                          url: str,
-                          *,
-                          return_json: Literal[False] = False) -> requests.Response:
-        pass
-
-    @overload
-    def _get_rate_limited(self,
-                          url: str,
-                          *,
-                          params: Mapping[str, str] | None = None,
-                          cast_to: Type[T]) -> T:
-        pass
-
-    @sleep_and_retry
-    @limits(calls=10, period=60)
-    def _get_rate_limited(
-            self,
-            url: str,
-            *,
-            return_json: bool = True,
-            params: Mapping[str, str] | None = None,
-            cast_to: Type[T] | None = None) -> T | requests.Response:  # pylint: disable=unused-argument
-        with self._session.get(url, params=params) as r:
-            r.raise_for_status()
-            return r.json() if return_json else r
+                log.warning(  # type: ignore[unreachable]
+                    'Unknown type: `%s`. Item %s will not be processed.',
+                    edge['node']['__typename'], edge['node']['id'])
+                shortcode = edge['node']['code']
+                self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')
 
-    def …
-… [old lines 240-244 truncated in the source diff]
-    def __enter__(self) -> 'InstagramClient':
-        """Recommended way to initialise the client."""
-        return self
-
-    def __exit__(self, _: Type[BaseException], __: BaseException, ___: Traceback) -> None:
-        """Clean up."""
-        self._cursor.close()
-        self._connection.close()
-
-    def process(self) -> None:
-        """Process posts."""
-        with chdir(self._output_dir):
-            self._get_rate_limited(f'https://www.instagram.com/{self._username}/',
-                                   return_json=False)
-            r = self._get_rate_limited('https://i.instagram.com/api/v1/users/web_profile_info/',
-                                       params={'username': self._username},
-                                       cast_to=WebProfileInfo)
-            with open('web_profile_info.json', 'w') as f:
-                json.dump(r, f, indent=2, sort_keys=True)
-            user_info = r['data']['user']
-            if not self._is_saved(user_info['profile_pic_url_hd']):
-                with open('profile_pic.jpg', 'wb') as f:
-                    for chunk in self._session.get(user_info['profile_pic_url_hd'],
-                                                   stream=True).iter_content(chunk_size=512):
-                        f.write(chunk)
-                self._save_to_log(user_info['profile_pic_url_hd'])
-            for item in self._highlights_tray(user_info['id'])['tray']:
-                self._add_video_url('https://www.instagram.com/stories/highlights/'
-                                    f'{item["id"].split(":")[-1]}/')
-            self._save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
-            page_info = user_info['edge_owner_to_timeline_media']['page_info']
-            while page_info['has_next_page']:
-                params = dict(query_hash='69cba40317214236af40e7efa697781d',
-                              variables=json.dumps(
-                                  dict(id=user_info['id'], first=12,
-                                       after=page_info['end_cursor'])))
-                media = self._get_rate_limited(
-                    'https://www.instagram.com/graphql/query/',
-                    params=params,
-                    cast_to=WebProfileInfo)['data']['user']['edge_owner_to_timeline_media']
-                page_info = media['page_info']
-                self._save_stuff(media['edges'])
-            if len(self._video_urls) > 0:
-                options = deepcopy(SHARED_YT_DLP_OPTIONS)
-                options.update({
-                    'cookiefile': None,
-                    'cookiesfrombrowser': (self._browser, self._browser_profile),
-                    'getcomments': self._get_comments,
-                    'verbose': self._debug
-                })
-                with yt_dlp.YoutubeDL(options) as ydl:
-                    failed_urls: list[str] = []
-                    while (self._video_urls and (url := self._video_urls.pop())):
-                        if self._is_saved(url):
-                            logger.debug(f'{url} is already saved')
-                            continue
-                        if ydl.extract_info(url):
-                            logger.debug(f'Extracting {url}')
-                            self._save_to_log(url)
-                        else:
-                            failed_urls.append(url)
-                    if len(failed_urls) > 0:
-                        logger.error('Some video URIs failed. Check failed.txt.')
-                        with open('failed.txt', 'w') as f:
-                            for url in failed_urls:
-                                f.write(f'{url}\n')
+    def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
+        """Get JSON data from a URL."""
+        with self.session.get(url, params=params, headers=API_HEADERS) as r:
+            r.raise_for_status()
+            return cast('T', r.json())
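Net effect of the client.py rewrite: the old monolithic `process()` loop (rate-limited with `ratelimit`, closing a database cursor and connection in `__exit__`) is removed, and the class becomes a set of small composable methods on a session built by `yt_dlp_utils.setup_session`. A hedged usage sketch follows, using only names visible in this diff; the user ID is a placeholder, and the `'tray'` key and highlights URL shape are carried over from the removed `process()` code rather than confirmed by the new version:

```python
# Hedged sketch of driving the rewritten 0.3.1 client. Only methods visible
# in the diff are used; 12345 is a placeholder user ID.
from instagram_archiver import InstagramClient

with InstagramClient(browser='chrome', browser_profile='Default') as client:
    client.add_csrf_token_header()  # raises CSRFTokenNotFound if the cookie is missing
    tray = client.highlights_tray(12345)  # GET .../highlights_tray/ via get_json()
    for item in tray['tray']:  # 'tray' key as used by the removed process() method
        client.add_video_url('https://www.instagram.com/stories/highlights/'
                             f'{item["id"].split(":")[-1]}/')
    print(client.video_urls, client.failed_urls)
```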