instagram-archiver 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of instagram-archiver might be problematic.
- instagram_archiver/__init__.py +8 -2
- instagram_archiver/__main__.py +6 -0
- instagram_archiver/client.py +219 -260
- instagram_archiver/constants.py +52 -92
- instagram_archiver/main.py +78 -46
- instagram_archiver/profile_scraper.py +194 -0
- instagram_archiver/py.typed +0 -0
- instagram_archiver/saved_scraper.py +78 -0
- instagram_archiver/typing.py +170 -0
- instagram_archiver/utils.py +98 -74
- instagram_archiver-0.3.0.dist-info/LICENSE.txt +18 -0
- instagram_archiver-0.3.0.dist-info/METADATA +119 -0
- instagram_archiver-0.3.0.dist-info/RECORD +15 -0
- {instagram_archiver-0.2.0.dist-info → instagram_archiver-0.3.0.dist-info}/WHEEL +1 -1
- instagram_archiver-0.3.0.dist-info/entry_points.txt +4 -0
- instagram_archiver/ig_typing.py +0 -117
- instagram_archiver-0.2.0.dist-info/LICENSE.txt +0 -21
- instagram_archiver-0.2.0.dist-info/METADATA +0 -37
- instagram_archiver-0.2.0.dist-info/RECORD +0 -11
- instagram_archiver-0.2.0.dist-info/entry_points.txt +0 -3
instagram_archiver/__init__.py
CHANGED
@@ -1,3 +1,9 @@
-
+"""Instagram archiver."""
+from __future__ import annotations

-
+from .client import InstagramClient
+from .profile_scraper import ProfileScraper
+from .saved_scraper import SavedScraper
+
+__all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
+__version__ = 'v0.3.0'
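Per the new `__all__` above, 0.3.0 re-exports its three entry-point classes at the package root. A minimal import sketch using only names present in the diff:

import instagram_archiver
from instagram_archiver import InstagramClient, ProfileScraper, SavedScraper

# The version string is the literal from the diff above (note the 'v' prefix).
assert instagram_archiver.__version__ == 'v0.3.0'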
instagram_archiver/client.py
CHANGED
@@ -1,168 +1,219 @@
-
-from
+"""Generic client."""
+from __future__ import annotations
+
+from http import HTTPStatus
+from os import utime
 from pathlib import Path
-from
-from typing import Collection, Literal, Mapping, Type, TypeVar, overload
-from urllib.parse import urlparse
+from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
 import json
-import
-import sqlite3
+import logging

-from
-from
-from
-from urllib3.util.retry import Retry
-from yt_dlp.cookies import extract_cookies_from_browser
+from bs4 import BeautifulSoup as Soup
+from requests import HTTPError
+from yt_dlp_utils import setup_session
 import requests
-import yt_dlp

-from .constants import
-from .
-
-
+from .constants import API_HEADERS, PAGE_FETCH_HEADERS, SHARED_HEADERS
+from .typing import (
+    CarouselMedia,
+    Comments,
+    Edge,
+    HighlightsTray,
+    MediaInfo,
+    MediaInfoItem,
+    MediaInfoItemImageVersions2Candidate,
+)
+from .utils import get_extension, json_dumps_formatted, write_if_new

-
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Mapping
+    from types import TracebackType

-
-T = TypeVar('T')
+    from .typing import BrowserName

+__all__ = ('CSRFTokenNotFound', 'InstagramClient')

-
-
-    return f'https://{parsed.netloc}{parsed.path}'
+T = TypeVar('T')
+log = logging.getLogger(__name__)


-class
-
+class CSRFTokenNotFound(RuntimeError):
+    """CSRF token not found in cookies."""


 class InstagramClient:
-
-
-
-
-                 output_dir: str | None = None,
-                 disable_log: bool = False,
-                 browser: Browser = 'chrome',
-                 browser_profile: str = 'Default',
-                 debug: bool = False,
-                 comments: bool = False) -> None:
-        self._no_log = disable_log
-        self._session = requests.Session()
-        self._browser = browser
-        self._browser_profile = browser_profile
-        self._setup_session(browser, browser_profile)
-        self._output_dir = Path(output_dir or Path('.').resolve() / username)
-        makedirs(self._output_dir, exist_ok=True)
-        self._log_db = Path(log_file or self._output_dir / '.log.db')
-        self._connection = sqlite3.connect(self._log_db)
-        self._cursor = self._connection.cursor()
-        self._setup_db()
-        self._username = username
-        self._video_urls: list[str] = []
-        self._debug = debug
-        self._get_comments = comments
+    """Generic client for Instagram."""
+    def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
+        """
+        Initialise the client.

-
-
-
+        Parameters
+        ----------
+        browser : str
+            The browser to use.

-
-
-
-
-
+        browser_profile : str
+            The browser profile to use.
+        """
+        self.session = setup_session(browser,
+                                     browser_profile,
+                                     SHARED_HEADERS,
+                                     domains={'instagram.com'},
+                                     setup_retry=True,
+                                     status_forcelist=(413, 429, 500, 502, 503, 504))
+        self.failed_urls: set[str] = set()
+        """Set of failed URLs."""
+        self.video_urls: list[str] = []
+        """List of video URLs to download."""

-    def
-
-
-
-        self._session.mount(
-            'https://',
-            HTTPAdapter(max_retries=Retry(
-                backoff_factor=1.5,  # wait times are normally 1 and 3 seconds
-                redirect=0,
-                status=0,
-                respect_retry_after_header=False,
-                status_forcelist=frozenset((413, 429, 500, 502, 503, 504)),
-                total=RETRY_ABORT_NUM)))
-        self._session.headers.update({
-            **SHARED_HEADERS,
-            **dict(cookie='; '.join(f'{cookie.name}={cookie.value}' \
-                for cookie in extract_cookies_from_browser(browser, browser_profile)
-                if 'instagram.com' in cookie.domain))
-        })
-        r = self._get_rate_limited('https://www.instagram.com', return_json=False)
-        m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
-        assert m is not None
-        self._session.headers.update({'x-csrftoken': m.group(1)})
+    def add_video_url(self, url: str) -> None:
+        """Add a video URL to the list of video URLs."""
+        log.info('Added video URL: %s', url)
+        self.video_urls.append(url)

-    def
-
-
-
-
+    def add_csrf_token_header(self) -> None:
+        """
+        Add CSRF token header to the session.
+
+        Raises
+        ------
+        CSRFTokenNotFound
+            If the CSRF token is not found in the cookies.
+        """
+        token = self.session.cookies.get('csrftoken')
+        if not token:
+            raise CSRFTokenNotFound
+        self.session.headers.update({'x-csrftoken': token})
+
+    def graphql_query(self,
+                      variables: Mapping[str, Any],
+                      *,
+                      cast_to: type[T],
+                      doc_id: str = '9806959572732215') -> T | None:
+        """Make a GraphQL query."""
+        with self.session.post('https://www.instagram.com/graphql/query',
+                               headers={
+                                   'content-type': 'application/x-www-form-urlencoded',
+                               } | API_HEADERS,
+                               data={
+                                   'doc_id': doc_id,
+                                   'variables': json.dumps(variables, separators=(',', ':'))
+                               }) as r:
+            if r.status_code != HTTPStatus.OK:
+                return None
+            data = r.json()
+            assert isinstance(data, dict)
+            if (status := data.get('status')) != 'ok':
+                log.error('GraphQL status not "ok": %s', status)
+                return None
+            if data.get('errors'):
+                log.warning('Response has errors.')
+                log.debug('Response: %s', json.dumps(data, indent=2))
+            if not data.get('data'):
+                log.error('No data in response.')
+            return cast('T', data['data'])
+
+    def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
+        """Get text from a URL."""
+        with self.session.get(url, params=params, headers=API_HEADERS) as r:
+            r.raise_for_status()
+            return r.text
+
+    def highlights_tray(self, user_id: int | str) -> HighlightsTray:
+        """Get the highlights tray data for a user."""
+        return self.get_json(
+            f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
+            cast_to=HighlightsTray)
+
+    def __enter__(self) -> Self:  # pragma: no cover
+        """Recommended way to initialise the client."""
+        return self
+
+    def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
+                 ___: TracebackType | None) -> None:
+        """Clean up."""
+
+    def is_saved(self, url: str) -> bool:  # pragma: no cover
+        """Check if a URL is already saved."""
+        return False

-    def
-
-        return False
-        self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
-        count: int
-        count, = self._cursor.fetchone()
-        return count == 1
+    def save_to_log(self, url: str) -> None:
+        """Save a URL to the log."""

-    def
-
+    def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
+        """Save images in the image_versions2 dictionary."""
         def key(x: MediaInfoItemImageVersions2Candidate) -> int:
             return x['width'] * x['height']

-        best =
-        if self.
+        best = max(sub_item['image_versions2']['candidates'], key=key)
+        if self.is_saved(best['url']):
+            return
+        r = self.session.head(best['url'])
+        if r.status_code != HTTPStatus.OK:
+            log.warning('HEAD request failed with status code %s.', r.status_code)
             return
-        r = self._session.head(best['url'])
-        r.raise_for_status()
         ext = get_extension(r.headers['content-type'])
         name = f'{sub_item["id"]}.{ext}'
-        with open(
-
-                stream=True).iter_content(chunk_size=512)):
-            f.write(content)
+        with Path(name).open('wb') as f:
+            f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
         utime(name, (timestamp, timestamp))
-        self.
+        self.save_to_log(r.url)

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def save_comments(self, edge: Edge) -> None:
+        """Save comments for an edge node."""
+        comment_url = ('https://www.instagram.com/api/v1/media/'
+                       f'{edge["node"]["id"]}/comments/')
+        shared_params = {'can_support_threading': 'true'}
+        try:
+            comment_data = self.get_json(comment_url,
+                                         params={
+                                             **shared_params, 'permalink_enabled': 'false'
+                                         },
+                                         cast_to=Comments)
+        except HTTPError:
+            log.exception('Failed to get comments.')
+            return
+        top_comment_data: Any = comment_data
+        while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
+            try:
+                comment_data = self.get_json(comment_url,
+                                             params={
+                                                 **shared_params,
+                                                 'min_id':
+                                                     comment_data['next_min_id'],
+                                             },
+                                             cast_to=Comments)
+            except HTTPError:
+                log.exception('Failed to get comments.')
+                break
+            top_comment_data['comments'] = (list(top_comment_data['comments']) +
+                                            list(comment_data['comments']))
+        comments_json = f'{edge["node"]["id"]}-comments.json'
+        with Path(comments_json).open('w+', encoding='utf-8') as f:
+            json.dump(top_comment_data, f, sort_keys=True, indent=2)

-    def
-
-
-
+    def save_media(self, edge: Edge) -> None:
+        """Save media for an edge node."""
+        log.info('Saving media at URL: https://www.instagram.com/p/%s', edge['node']['code'])
+        media_info_url = f'https://www.instagram.com/p/{edge["node"]["code"]}/'
+        if self.is_saved(media_info_url):
+            return
+        r = self.session.get(media_info_url, headers=PAGE_FETCH_HEADERS)
+        if r.status_code != HTTPStatus.OK:
+            log.warning('GET request failed with status code %s.', r.status_code)
+            return
+        if 'image_versions2' not in r.text or 'taken_at' not in r.text:
+            log.warning('Invalid response. image_versions2 dict not found.')
             return
-
-
-
-
+        soup = Soup(r.text, 'html5lib')
+        media_info_embedded = next(
+            json.loads(s) for s in (''.join(
+                getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
+                for script in soup.select('script[type="application/json"]'))
+            if 'image_versions2' in s and 'taken_at' in s)
+        media_info: MediaInfo = (
+            media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
+            ['result']['data']['xdt_api__v1__media__shortcode__web_info'])
         timestamp = media_info['items'][0]['taken_at']
         id_json_file = f'{edge["node"]["id"]}.json'
         media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -170,140 +221,48 @@ class InstagramClient:
         write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
         for file in (id_json_file, media_info_json_file):
             utime(file, (timestamp, timestamp))
-        self.
+        self.save_to_log(media_info_url)
         for item in media_info['items']:
             timestamp = item['taken_at']
-            if
-                for sub_item in
-                    self.
+            if (carousel_media := item.get('carousel_media')):
+                for sub_item in carousel_media:
+                    self.save_image_versions2(sub_item, timestamp)
             elif 'image_versions2' in item:
-                self.
+                self.save_image_versions2(item, timestamp)

-    def
+    def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
+        """Save edge node media."""
         for edge in edges:
-            if edge['node']['__typename'] == '
+            if edge['node']['__typename'] == 'XDTMediaDict':
                 try:
-                    shortcode = edge['node']['
-                except KeyError
+                    shortcode = edge['node']['code']
+                except KeyError:
                     if parent_edge:
                         try:
-                            shortcode = parent_edge['node']['
-                        except KeyError
-
+                            shortcode = parent_edge['node']['code']
+                        except KeyError:
+                            log.exception('Unknown shortcode.')
+                            return
                     else:
-
-
-
-
-
-
-
-
-
-
+                        log.exception('Unknown shortcode.')
+                if edge['node'].get('video_dash_manifest'):
+                    self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
+                else:
+                    try:
+                        self.save_comments(edge)
+                        self.save_media(edge)
+                    except requests.exceptions.RetryError:
+                        log.exception('Retries exhausted.')
+                        return
             else:
-
-
-
-
-
-
-    @overload
-    def _get_rate_limited(self,
-                          url: str,
-                          *,
-                          return_json: Literal[False] = False) -> requests.Response:
-        pass
-
-    @overload
-    def _get_rate_limited(self,
-                          url: str,
-                          *,
-                          params: Mapping[str, str] | None = None,
-                          cast_to: Type[T]) -> T:
-        pass
+                log.warning(  # type: ignore[unreachable]
+                    'Unknown type: `%s`. Item %s will not be processed.',
+                    edge['node']['__typename'], edge['node']['id'])
+                shortcode = edge['node']['code']
+                self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')

-
-
-
-            self,
-            url: str,
-            *,
-            return_json: bool = True,
-            params: Mapping[str, str] | None = None,
-            cast_to: Type[T] | None = None) -> T | requests.Response:  # pylint: disable=unused-argument
-        with self._session.get(url, params=params) as r:
+    def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
+        """Get JSON data from a URL."""
+        with self.session.get(url, params=params, headers=API_HEADERS) as r:
             r.raise_for_status()
-            return r.json()
-
-    def _highlights_tray(self, user_id: int | str) -> HighlightsTray:
-        return self._get_rate_limited(
-            f'https://i.instagram.com/api/v1/highlights/{user_id}/'
-            'highlights_tray/',
-            cast_to=HighlightsTray)
-
-    def __enter__(self) -> 'InstagramClient':
-        return self
-
-    def __exit__(self, _: Type[BaseException], __: BaseException, ___: Traceback) -> None:
-        self._cursor.close()
-        self._connection.close()
-
-    def process(self) -> None:
-        with chdir(self._output_dir):
-            self._get_rate_limited(f'https://www.instagram.com/{self._username}/',
-                                   return_json=False)
-            r = self._get_rate_limited('https://i.instagram.com/api/v1/users/web_profile_info/',
-                                       params={'username': self._username},
-                                       cast_to=WebProfileInfo)
-            with open('web_profile_info.json', 'w') as f:
-                json.dump(r, f, indent=2, sort_keys=True)
-            user_info = r['data']['user']
-            if not self._is_saved(user_info['profile_pic_url_hd']):
-                with open('profile_pic.jpg', 'wb') as f:
-                    for chunk in self._session.get(user_info['profile_pic_url_hd'],
-                                                   stream=True).iter_content(chunk_size=512):
-                        f.write(chunk)
-                self._save_to_log(user_info['profile_pic_url_hd'])
-            for item in self._highlights_tray(user_info['id'])['tray']:
-                self._add_video_url('https://www.instagram.com/stories/highlights/'
-                                    f'{item["id"].split(":")[-1]}/')
-            self._save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
-            page_info = user_info['edge_owner_to_timeline_media']['page_info']
-            while page_info['has_next_page']:
-                params = dict(query_hash='69cba40317214236af40e7efa697781d',
-                              variables=json.dumps(
-                                  dict(id=user_info['id'], first=12,
-                                       after=page_info['end_cursor'])))
-                media = self._get_rate_limited(
-                    'https://www.instagram.com/graphql/query/',
-                    params=params,
-                    cast_to=WebProfileInfo)['data']['user']['edge_owner_to_timeline_media']
-                page_info = media['page_info']
-                self._save_stuff(media['edges'])
-            if len(self._video_urls) > 0:
-                with yt_dlp.YoutubeDL({
-                        **SHARED_YT_DLP_OPTIONS,  # type: ignore[misc]
-                        **{
-                            'cookiesfrombrowser': [
-                                self._browser, self._browser_profile, None, None
-                            ],
-                            'getcomments': self._get_comments,
-                            'verbose': self._debug
-                        }
-                }) as ydl:
-                    failed_urls: list[str] = []
-                    while (self._video_urls and (url := self._video_urls.pop())):
-                        if self._is_saved(url):
-                            logger.debug(f'{url} is already saved')
-                            continue
-                        if ydl.extract_info(url):
-                            logger.debug(f'Extracting {url}')
-                            self._save_to_log(url)
-                        else:
-                            failed_urls.append(url)
-                    if len(failed_urls) > 0:
-                        logger.error('Some video URIs failed. Check failed.txt.')
-                        with open('failed.txt', 'w') as f:
-                            for url in failed_urls:
-                                f.write(f'{url}\n')
+            return cast('T', r.json())
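Net effect of the two hunks: the 0.2.0 class that owned cookie extraction, an sqlite URL log, and a monolithic process() download loop becomes a thin wrapper over yt_dlp_utils.setup_session, with is_saved() and save_to_log() left as overridable no-op hooks; the download loop presumably moves to the new profile_scraper.py and saved_scraper.py listed above, which this diff does not show. A hedged usage sketch of the 0.3.0 client, using only signatures visible in this diff; the user ID is a placeholder:

from instagram_archiver import InstagramClient

# __enter__/__exit__ above make the client a context manager.
with InstagramClient(browser='chrome', browser_profile='Default') as client:
    # Raises CSRFTokenNotFound if the csrftoken cookie was not picked up.
    client.add_csrf_token_header()
    # highlights_tray() wraps get_json(); the 'tray' key and the 'id' format
    # come from the removed 0.2.0 code that consumed this endpoint.
    tray = client.highlights_tray('123456')  # placeholder user ID
    for item in tray['tray']:
        client.add_video_url('https://www.instagram.com/stories/highlights/'
                             f'{item["id"].split(":")[-1]}/')
    print(client.video_urls)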