instagram-archiver 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of instagram-archiver might be problematic; consult the registry's advisory page for this release for more details.

@@ -1,3 +1,9 @@
1
- from .main import main as instagram_archiver
1
+ """Instagram archiver."""
2
+ from __future__ import annotations
2
3
 
3
- __all__ = ('instagram_archiver',)
4
+ from .client import InstagramClient
5
+ from .profile_scraper import ProfileScraper
6
+ from .saved_scraper import SavedScraper
7
+
8
+ __all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
9
+ __version__ = 'v0.3.1'
@@ -1,3 +1,6 @@
1
+ """Entry point for ``python -m`` invocation."""
2
+ from __future__ import annotations
3
+
1
4
  from .main import main
2
5
 
3
- main() # pylint: disable=no-value-for-parameter
6
+ main()
@@ -1,168 +1,223 @@
1
- from copy import deepcopy
2
- from inspect import Traceback
3
- from os import makedirs, utime
1
+ """Generic client."""
2
+ from __future__ import annotations
3
+
4
+ from http import HTTPStatus
5
+ from os import utime
4
6
  from pathlib import Path
5
- from pprint import pprint as pp
6
- from typing import Collection, Literal, Mapping, Type, TypeVar, overload
7
- from urllib.parse import urlparse
7
+ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
8
8
  import json
9
- import re
10
- import sqlite3
11
-
12
- from loguru import logger
13
- from ratelimit import limits, sleep_and_retry
14
- from requests.adapters import HTTPAdapter
15
- from urllib3.util.retry import Retry
16
- from yt_dlp.cookies import extract_cookies_from_browser
9
+ import logging
10
+
11
+ from requests import HTTPError
12
+ from yt_dlp_utils import setup_session
17
13
  import requests
18
- import yt_dlp
19
14
 
20
- from .constants import LOG_SCHEMA, RETRY_ABORT_NUM, SHARED_HEADERS, SHARED_YT_DLP_OPTIONS
21
- from .ig_typing import (BrowserName, CarouselMedia, Comments, Edge, HighlightsTray, MediaInfo,
22
- MediaInfoItem, MediaInfoItemImageVersions2Candidate, WebProfileInfo)
23
- from .utils import chdir, get_extension, json_dumps_formatted, write_if_new
15
+ from .constants import API_HEADERS, SHARED_HEADERS
16
+ from .typing import (
17
+ CarouselMedia,
18
+ Comments,
19
+ Edge,
20
+ HighlightsTray,
21
+ MediaInfo,
22
+ MediaInfoItem,
23
+ MediaInfoItemImageVersions2Candidate,
24
+ )
25
+ from .utils import get_extension, json_dumps_formatted, write_if_new
26
+
27
+ if TYPE_CHECKING:
28
+ from collections.abc import Iterable, Mapping
29
+ from types import TracebackType
24
30
 
25
- __all__ = ('InstagramClient',)
31
+ from .typing import BrowserName
32
+
33
+ __all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')
26
34
 
27
35
  T = TypeVar('T')
36
+ log = logging.getLogger(__name__)
28
37
 
29
38
 
30
- def _clean_url(url: str) -> str:
31
- parsed = urlparse(url)
32
- return f'https://{parsed.netloc}{parsed.path}'
39
+ class CSRFTokenNotFound(RuntimeError):
40
+ """CSRF token not found in cookies."""
33
41
 
34
42
 
35
- class AuthenticationError(Exception):
36
- pass
43
+ class UnexpectedRedirect(RuntimeError):
44
+ """Unexpected redirect in a request."""
37
45
 
38
46
 
39
47
  class InstagramClient:
40
- """The client."""
41
- def __init__(self,
42
- *,
43
- username: str,
44
- log_file: str | Path | None = None,
45
- output_dir: str | Path | None = None,
46
- disable_log: bool = False,
47
- browser: BrowserName = 'chrome',
48
- browser_profile: str = 'Default',
49
- debug: bool = False,
50
- comments: bool = False) -> None:
51
- self._no_log = disable_log
52
- self._session = requests.Session()
53
- self._browser = browser
54
- self._browser_profile = browser_profile
55
- self._setup_session(browser, browser_profile)
56
- self._output_dir = Path(output_dir or Path('.').resolve() / username)
57
- makedirs(self._output_dir, exist_ok=True)
58
- self._log_db = Path(log_file or self._output_dir / '.log.db')
59
- self._connection = sqlite3.connect(self._log_db)
60
- self._cursor = self._connection.cursor()
61
- self._setup_db()
62
- self._username = username
63
- self._video_urls: list[str] = []
64
- self._debug = debug
65
- self._get_comments = comments
66
-
67
- def _add_video_url(self, url: str) -> None:
68
- logger.debug(f'Added video URL: {url}')
69
- self._video_urls.append(url)
70
-
71
- def _setup_db(self) -> None:
72
- existed = self._log_db.exists()
73
- if not existed or (existed and self._log_db.stat().st_size == 0):
74
- logger.debug('Creating schema')
75
- self._cursor.execute(LOG_SCHEMA)
76
-
77
- def _setup_session(self,
78
- browser: BrowserName = 'chrome',
79
- browser_profile: str = 'Default') -> None:
80
- self._session.mount(
81
- 'https://',
82
- HTTPAdapter(max_retries=Retry(
83
- backoff_factor=1.5, # wait times are normally 1 and 3 seconds
84
- redirect=0,
85
- status=0,
86
- respect_retry_after_header=False,
87
- status_forcelist=frozenset((413, 429, 500, 502, 503, 504)),
88
- total=RETRY_ABORT_NUM)))
89
- self._session.headers.update({
90
- **SHARED_HEADERS,
91
- **dict(cookie='; '.join(f'{cookie.name}={cookie.value}' \
92
- for cookie in extract_cookies_from_browser(browser, browser_profile)
93
- if 'instagram.com' in cookie.domain))
94
- })
95
- r = self._get_rate_limited('https://www.instagram.com', return_json=False)
96
- m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
97
- assert m is not None
98
- self._session.headers.update({'x-csrftoken': m.group(1)})
99
-
100
- def _save_to_log(self, url: str) -> None:
101
- if self._no_log:
102
- return
103
- self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
104
- self._connection.commit()
105
-
106
- def _is_saved(self, url: str) -> bool:
107
- if self._no_log:
108
- return False
109
- self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
110
- count: int
111
- count, = self._cursor.fetchone()
112
- return count == 1
113
-
114
- def _save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem,
115
- timestamp: int) -> None:
48
+ """Generic client for Instagram."""
49
+ def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
50
+ """
51
+ Initialise the client.
52
+
53
+ Parameters
54
+ ----------
55
+ browser : str
56
+ The browser to use.
57
+
58
+ browser_profile : str
59
+ The browser profile to use.
60
+ """
61
+ self.session = setup_session(browser,
62
+ browser_profile,
63
+ SHARED_HEADERS,
64
+ domains={'instagram.com'},
65
+ status_forcelist=(413, 429, 500, 502, 503, 504))
66
+ self.failed_urls: set[str] = set()
67
+ """Set of failed URLs."""
68
+ self.video_urls: list[str] = []
69
+ """List of video URLs to download."""
70
+
71
+ def add_video_url(self, url: str) -> None:
72
+ """Add a video URL to the list of video URLs."""
73
+ log.info('Added video URL: %s', url)
74
+ self.video_urls.append(url)
75
+
76
+ def add_csrf_token_header(self) -> None:
77
+ """
78
+ Add CSRF token header to the session.
79
+
80
+ Raises
81
+ ------
82
+ CSRFTokenNotFound
83
+ If the CSRF token is not found in the cookies.
84
+ """
85
+ token = self.session.cookies.get('csrftoken')
86
+ if not token:
87
+ raise CSRFTokenNotFound
88
+ self.session.headers.update({'x-csrftoken': token})
89
+
90
+ def graphql_query(self,
91
+ variables: Mapping[str, Any],
92
+ *,
93
+ cast_to: type[T],
94
+ doc_id: str = '9806959572732215') -> T | None:
95
+ """Make a GraphQL query."""
96
+ with self.session.post('https://www.instagram.com/graphql/query',
97
+ headers={
98
+ 'content-type': 'application/x-www-form-urlencoded',
99
+ } | API_HEADERS,
100
+ data={
101
+ 'doc_id': doc_id,
102
+ 'variables': json.dumps(variables, separators=(',', ':'))
103
+ }) as r:
104
+ if r.status_code != HTTPStatus.OK:
105
+ return None
106
+ data = r.json()
107
+ assert isinstance(data, dict)
108
+ if (status := data.get('status')) != 'ok':
109
+ log.error('GraphQL status not "ok": %s', status)
110
+ return None
111
+ if data.get('errors'):
112
+ log.warning('Response has errors.')
113
+ log.debug('Response: %s', json.dumps(data, indent=2))
114
+ if not data.get('data'):
115
+ log.error('No data in response.')
116
+ return cast('T', data['data'])
117
+
118
+ def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
119
+ """Get text from a URL."""
120
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
121
+ r.raise_for_status()
122
+ return r.text
123
+
124
+ def highlights_tray(self, user_id: int | str) -> HighlightsTray:
125
+ """Get the highlights tray data for a user."""
126
+ return self.get_json(
127
+ f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
128
+ cast_to=HighlightsTray)
129
+
130
+ def __enter__(self) -> Self: # pragma: no cover
131
+ """Recommended way to initialise the client."""
132
+ return self
133
+
134
+ def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
135
+ ___: TracebackType | None) -> None:
136
+ """Clean up."""
137
+
138
+ def is_saved(self, url: str) -> bool: # pragma: no cover
139
+ """Check if a URL is already saved."""
140
+ return False
141
+
142
+ def save_to_log(self, url: str) -> None:
143
+ """Save a URL to the log."""
144
+
145
+ def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
146
+ """Save images in the image_versions2 dictionary."""
116
147
  def key(x: MediaInfoItemImageVersions2Candidate) -> int:
117
148
  return x['width'] * x['height']
118
149
 
119
- best = sorted(sub_item['image_versions2']['candidates'], key=key, reverse=True)[0]
120
- if self._is_saved(best['url']):
150
+ best = max(sub_item['image_versions2']['candidates'], key=key)
151
+ if self.is_saved(best['url']):
152
+ return
153
+ r = self.session.head(best['url'])
154
+ if r.status_code != HTTPStatus.OK:
155
+ log.warning('HEAD request failed with status code %s.', r.status_code)
121
156
  return
122
- r = self._session.head(best['url'])
123
- r.raise_for_status()
124
157
  ext = get_extension(r.headers['content-type'])
125
158
  name = f'{sub_item["id"]}.{ext}'
126
- with open(name, 'wb') as f:
127
- for content in (self._session.get(best['url'],
128
- stream=True).iter_content(chunk_size=512)):
129
- f.write(content)
159
+ with Path(name).open('wb') as f:
160
+ f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
130
161
  utime(name, (timestamp, timestamp))
131
- self._save_to_log(r.url)
132
-
133
- def _save_comments(self, edge: Edge) -> None:
134
- if self._get_comments:
135
- comment_url = ('https://www.instagram.com/api/v1/media/'
136
- f'{edge["node"]["id"]}/comments/')
137
- shared_params = dict(can_support_threading='true')
138
- top_comment_data = comment_data = self._get_rate_limited(
139
- comment_url,
140
- params={
141
- **shared_params, 'permalink_enabled': 'false'
142
- },
143
- cast_to=Comments)
144
- while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
145
- comment_data = self._get_rate_limited(comment_url,
146
- params={
147
- **shared_params,
148
- 'min_id':
149
- comment_data['next_min_id'],
150
- },
151
- cast_to=Comments)
152
- top_comment_data['comments'].extend(comment_data['comments'])
153
- comments_json = f'{edge["node"]["id"]}-comments.json'
154
- with open(comments_json, 'w+') as f:
155
- json.dump(top_comment_data, f, sort_keys=True, indent=2)
156
-
157
- def _save_media(self, edge: Edge) -> None:
158
- media_info_url = ('https://i.instagram.com/api/v1/media/'
159
- f'{edge["node"]["id"]}/info/')
160
- if self._is_saved(media_info_url):
162
+ self.save_to_log(r.url)
163
+
164
+ def save_comments(self, edge: Edge) -> None:
165
+ """Save comments for an edge node."""
166
+ comment_url = ('https://www.instagram.com/api/v1/media/'
167
+ f'{edge["node"]["id"]}/comments/')
168
+ shared_params = {'can_support_threading': 'true'}
169
+ try:
170
+ comment_data = self.get_json(comment_url,
171
+ params={
172
+ **shared_params, 'permalink_enabled': 'false'
173
+ },
174
+ cast_to=Comments)
175
+ except HTTPError:
176
+ log.exception('Failed to get comments.')
177
+ return
178
+ top_comment_data: Any = comment_data
179
+ while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
180
+ try:
181
+ comment_data = self.get_json(comment_url,
182
+ params={
183
+ **shared_params,
184
+ 'min_id':
185
+ comment_data['next_min_id'],
186
+ },
187
+ cast_to=Comments)
188
+ except HTTPError:
189
+ log.exception('Failed to get comments.')
190
+ break
191
+ top_comment_data['comments'] = (list(top_comment_data['comments']) +
192
+ list(comment_data['comments']))
193
+ comments_json = f'{edge["node"]["id"]}-comments.json'
194
+ with Path(comments_json).open('w+', encoding='utf-8') as f:
195
+ json.dump(top_comment_data, f, sort_keys=True, indent=2)
196
+
197
+ def save_media(self, edge: Edge) -> None:
198
+ """
199
+ Save media for an edge node.
200
+
201
+ Raises
202
+ ------
203
+ UnexpectedRedirect
204
+ If a redirect occurs unexpectedly.
205
+ """
206
+ media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
207
+ log.info('Saving media at URL: %s', media_info_url)
208
+ if self.is_saved(media_info_url):
209
+ return
210
+ r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
211
+ if r.status_code != HTTPStatus.OK:
212
+ if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
213
+ raise UnexpectedRedirect
214
+ log.warning('GET request failed with status code %s.', r.status_code)
215
+ log.debug('Content: %s', r.text)
161
216
  return
162
- media_info = self._get_rate_limited(media_info_url, cast_to=MediaInfo)
163
- if media_info['more_available'] or media_info['num_results'] != 1:
164
- pp(media_info)
165
- raise ValueError('Unhandled more_available set to True')
217
+ if 'image_versions2' not in r.text or 'taken_at' not in r.text:
218
+ log.warning('Invalid response. image_versions2 dict not found.')
219
+ return
220
+ media_info: MediaInfo = r.json()
166
221
  timestamp = media_info['items'][0]['taken_at']
167
222
  id_json_file = f'{edge["node"]["id"]}.json'
168
223
  media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -170,141 +225,48 @@ class InstagramClient:
170
225
  write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
171
226
  for file in (id_json_file, media_info_json_file):
172
227
  utime(file, (timestamp, timestamp))
173
- self._save_to_log(media_info_url)
228
+ self.save_to_log(media_info_url)
174
229
  for item in media_info['items']:
175
230
  timestamp = item['taken_at']
176
- if 'carousel_media' in item:
177
- for sub_item in item['carousel_media']:
178
- self._save_image_versions2(sub_item, timestamp)
231
+ if (carousel_media := item.get('carousel_media')):
232
+ for sub_item in carousel_media:
233
+ self.save_image_versions2(sub_item, timestamp)
179
234
  elif 'image_versions2' in item:
180
- self._save_image_versions2(item, timestamp)
235
+ self.save_image_versions2(item, timestamp)
181
236
 
182
- def _save_stuff(self, edges: Collection[Edge], parent_edge: Edge | None = None) -> None:
237
+ def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
238
+ """Save edge node media."""
183
239
  for edge in edges:
184
- if edge['node']['__typename'] == 'GraphVideo':
240
+ if edge['node']['__typename'] == 'XDTMediaDict':
185
241
  try:
186
- shortcode = edge['node']['shortcode']
187
- except KeyError as e:
242
+ shortcode = edge['node']['code']
243
+ except KeyError:
188
244
  if parent_edge:
189
245
  try:
190
- shortcode = parent_edge['node']['shortcode']
191
- except KeyError as exc:
192
- raise ValueError('Unknown shortcode') from exc
246
+ shortcode = parent_edge['node']['code']
247
+ except KeyError:
248
+ log.exception('Unknown shortcode.')
249
+ return
193
250
  else:
194
- raise ValueError('Unknown shortcode') from e
195
- self._add_video_url(f'https://www.instagram.com/p/{shortcode}')
196
- elif edge['node']['__typename'] == 'GraphImage':
197
- self._save_media(edge)
198
- elif edge['node']['__typename'] == 'GraphSidecar':
199
- logger.debug('Recursion into child edges')
200
- if (not edge['node']['comments_disabled']
201
- and edge['node']['edge_media_to_comment']['count']):
202
- self._save_comments(edge)
203
- self._save_stuff(edge['node']['edge_sidecar_to_children']['edges'], edge)
251
+ log.exception('Unknown shortcode.')
252
+ if edge['node'].get('video_dash_manifest'):
253
+ self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
254
+ else:
255
+ try:
256
+ self.save_comments(edge)
257
+ self.save_media(edge)
258
+ except requests.exceptions.RetryError:
259
+ log.exception('Retries exhausted.')
260
+ return
204
261
  else:
205
- raise ValueError(f'Unknown type "{edge["node"]["__typename"]}"')
206
-
207
- @overload
208
- def _get_rate_limited(self, url: str, *, cast_to: Type[T]) -> T:
209
- pass
210
-
211
- @overload
212
- def _get_rate_limited(self,
213
- url: str,
214
- *,
215
- return_json: Literal[False] = False) -> requests.Response:
216
- pass
217
-
218
- @overload
219
- def _get_rate_limited(self,
220
- url: str,
221
- *,
222
- params: Mapping[str, str] | None = None,
223
- cast_to: Type[T]) -> T:
224
- pass
225
-
226
- @sleep_and_retry
227
- @limits(calls=10, period=60)
228
- def _get_rate_limited(
229
- self,
230
- url: str,
231
- *,
232
- return_json: bool = True,
233
- params: Mapping[str, str] | None = None,
234
- cast_to: Type[T] | None = None) -> T | requests.Response: # pylint: disable=unused-argument
235
- with self._session.get(url, params=params) as r:
236
- r.raise_for_status()
237
- return r.json() if return_json else r
262
+ log.warning( # type: ignore[unreachable]
263
+ 'Unknown type: `%s`. Item %s will not be processed.',
264
+ edge['node']['__typename'], edge['node']['id'])
265
+ shortcode = edge['node']['code']
266
+ self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')
238
267
 
239
- def _highlights_tray(self, user_id: int | str) -> HighlightsTray:
240
- return self._get_rate_limited(
241
- f'https://i.instagram.com/api/v1/highlights/{user_id}/'
242
- 'highlights_tray/',
243
- cast_to=HighlightsTray)
244
-
245
- def __enter__(self) -> 'InstagramClient':
246
- """Recommended way to initialise the client."""
247
- return self
248
-
249
- def __exit__(self, _: Type[BaseException], __: BaseException, ___: Traceback) -> None:
250
- """Clean up."""
251
- self._cursor.close()
252
- self._connection.close()
253
-
254
- def process(self) -> None:
255
- """Process posts."""
256
- with chdir(self._output_dir):
257
- self._get_rate_limited(f'https://www.instagram.com/{self._username}/',
258
- return_json=False)
259
- r = self._get_rate_limited('https://i.instagram.com/api/v1/users/web_profile_info/',
260
- params={'username': self._username},
261
- cast_to=WebProfileInfo)
262
- with open('web_profile_info.json', 'w') as f:
263
- json.dump(r, f, indent=2, sort_keys=True)
264
- user_info = r['data']['user']
265
- if not self._is_saved(user_info['profile_pic_url_hd']):
266
- with open('profile_pic.jpg', 'wb') as f:
267
- for chunk in self._session.get(user_info['profile_pic_url_hd'],
268
- stream=True).iter_content(chunk_size=512):
269
- f.write(chunk)
270
- self._save_to_log(user_info['profile_pic_url_hd'])
271
- for item in self._highlights_tray(user_info['id'])['tray']:
272
- self._add_video_url('https://www.instagram.com/stories/highlights/'
273
- f'{item["id"].split(":")[-1]}/')
274
- self._save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
275
- page_info = user_info['edge_owner_to_timeline_media']['page_info']
276
- while page_info['has_next_page']:
277
- params = dict(query_hash='69cba40317214236af40e7efa697781d',
278
- variables=json.dumps(
279
- dict(id=user_info['id'], first=12,
280
- after=page_info['end_cursor'])))
281
- media = self._get_rate_limited(
282
- 'https://www.instagram.com/graphql/query/',
283
- params=params,
284
- cast_to=WebProfileInfo)['data']['user']['edge_owner_to_timeline_media']
285
- page_info = media['page_info']
286
- self._save_stuff(media['edges'])
287
- if len(self._video_urls) > 0:
288
- options = deepcopy(SHARED_YT_DLP_OPTIONS)
289
- options.update({
290
- 'cookiefile': None,
291
- 'cookiesfrombrowser': (self._browser, self._browser_profile),
292
- 'getcomments': self._get_comments,
293
- 'verbose': self._debug
294
- })
295
- with yt_dlp.YoutubeDL(options) as ydl:
296
- failed_urls: list[str] = []
297
- while (self._video_urls and (url := self._video_urls.pop())):
298
- if self._is_saved(url):
299
- logger.debug(f'{url} is already saved')
300
- continue
301
- if ydl.extract_info(url):
302
- logger.debug(f'Extracting {url}')
303
- self._save_to_log(url)
304
- else:
305
- failed_urls.append(url)
306
- if len(failed_urls) > 0:
307
- logger.error('Some video URIs failed. Check failed.txt.')
308
- with open('failed.txt', 'w') as f:
309
- for url in failed_urls:
310
- f.write(f'{url}\n')
268
+ def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
269
+ """Get JSON data from a URL."""
270
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
271
+ r.raise_for_status()
272
+ return cast('T', r.json())