instagram-archiver 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of instagram-archiver might be problematic. Click here for more details.

@@ -1,3 +1,9 @@
1
- from .main import main
1
+ """Instagram archiver."""
2
+ from __future__ import annotations
2
3
 
3
- __all__ = ('main',)
4
+ from .client import InstagramClient
5
+ from .profile_scraper import ProfileScraper
6
+ from .saved_scraper import SavedScraper
7
+
8
+ __all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
9
+ __version__ = 'v0.3.0'
@@ -0,0 +1,6 @@
1
+ """Entry point for ``python -m`` invocation."""
2
+ from __future__ import annotations
3
+
4
+ from .main import main
5
+
6
+ main()
@@ -1,168 +1,219 @@
1
- from inspect import Traceback
2
- from os import makedirs, utime
1
+ """Generic client."""
2
+ from __future__ import annotations
3
+
4
+ from http import HTTPStatus
5
+ from os import utime
3
6
  from pathlib import Path
4
- from pprint import pprint as pp
5
- from typing import Collection, Literal, Mapping, Type, TypeVar, overload
6
- from urllib.parse import urlparse
7
+ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
7
8
  import json
8
- import re
9
- import sqlite3
9
+ import logging
10
10
 
11
- from loguru import logger
12
- from ratelimit import limits, sleep_and_retry
13
- from requests.adapters import HTTPAdapter
14
- from urllib3.util.retry import Retry
15
- from yt_dlp.cookies import extract_cookies_from_browser
11
+ from bs4 import BeautifulSoup as Soup
12
+ from requests import HTTPError
13
+ from yt_dlp_utils import setup_session
16
14
  import requests
17
- import yt_dlp
18
15
 
19
- from .constants import LOG_SCHEMA, RETRY_ABORT_NUM, SHARED_HEADERS, SHARED_YT_DLP_OPTIONS
20
- from .ig_typing import (CarouselMedia, Comments, Edge, HighlightsTray, MediaInfo, MediaInfoItem,
21
- MediaInfoItemImageVersions2Candidate, WebProfileInfo)
22
- from .utils import chdir, get_extension, json_dumps_formatted, write_if_new
16
+ from .constants import API_HEADERS, PAGE_FETCH_HEADERS, SHARED_HEADERS
17
+ from .typing import (
18
+ CarouselMedia,
19
+ Comments,
20
+ Edge,
21
+ HighlightsTray,
22
+ MediaInfo,
23
+ MediaInfoItem,
24
+ MediaInfoItemImageVersions2Candidate,
25
+ )
26
+ from .utils import get_extension, json_dumps_formatted, write_if_new
23
27
 
24
- __all__ = ('InstagramClient',)
28
+ if TYPE_CHECKING:
29
+ from collections.abc import Iterable, Mapping
30
+ from types import TracebackType
25
31
 
26
- Browser = Literal['brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari']
27
- T = TypeVar('T')
32
+ from .typing import BrowserName
28
33
 
34
+ __all__ = ('CSRFTokenNotFound', 'InstagramClient')
29
35
 
30
- def _clean_url(url: str) -> str:
31
- parsed = urlparse(url)
32
- return f'https://{parsed.netloc}{parsed.path}'
36
+ T = TypeVar('T')
37
+ log = logging.getLogger(__name__)
33
38
 
34
39
 
35
- class AuthenticationError(Exception):
36
- pass
40
+ class CSRFTokenNotFound(RuntimeError):
41
+ """CSRF token not found in cookies."""
37
42
 
38
43
 
39
44
  class InstagramClient:
40
- def __init__(self,
41
- *,
42
- username: str,
43
- log_file: str | Path | None = None,
44
- output_dir: str | None = None,
45
- disable_log: bool = False,
46
- browser: Browser = 'chrome',
47
- browser_profile: str = 'Default',
48
- debug: bool = False,
49
- comments: bool = False) -> None:
50
- self._no_log = disable_log
51
- self._session = requests.Session()
52
- self._browser = browser
53
- self._browser_profile = browser_profile
54
- self._setup_session(browser, browser_profile)
55
- self._output_dir = Path(output_dir or Path('.').resolve() / username)
56
- makedirs(self._output_dir, exist_ok=True)
57
- self._log_db = Path(log_file or self._output_dir / '.log.db')
58
- self._connection = sqlite3.connect(self._log_db)
59
- self._cursor = self._connection.cursor()
60
- self._setup_db()
61
- self._username = username
62
- self._video_urls: list[str] = []
63
- self._debug = debug
64
- self._get_comments = comments
45
+ """Generic client for Instagram."""
46
+ def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
47
+ """
48
+ Initialise the client.
65
49
 
66
- def _add_video_url(self, url: str) -> None:
67
- logger.debug(f'Added video URL: {url}')
68
- self._video_urls.append(url)
50
+ Parameters
51
+ ----------
52
+ browser : str
53
+ The browser to use.
69
54
 
70
- def _setup_db(self) -> None:
71
- existed = self._log_db.exists()
72
- if not existed or (existed and self._log_db.stat().st_size == 0):
73
- logger.debug('Creating schema')
74
- self._cursor.execute(LOG_SCHEMA)
55
+ browser_profile : str
56
+ The browser profile to use.
57
+ """
58
+ self.session = setup_session(browser,
59
+ browser_profile,
60
+ SHARED_HEADERS,
61
+ domains={'instagram.com'},
62
+ setup_retry=True,
63
+ status_forcelist=(413, 429, 500, 502, 503, 504))
64
+ self.failed_urls: set[str] = set()
65
+ """Set of failed URLs."""
66
+ self.video_urls: list[str] = []
67
+ """List of video URLs to download."""
75
68
 
76
- def _setup_session(self,
77
- browser: Literal['brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi',
78
- 'firefox', 'safari'] = 'chrome',
79
- browser_profile: str = 'Default') -> None:
80
- self._session.mount(
81
- 'https://',
82
- HTTPAdapter(max_retries=Retry(
83
- backoff_factor=1.5, # wait times are normally 1 and 3 seconds
84
- redirect=0,
85
- status=0,
86
- respect_retry_after_header=False,
87
- status_forcelist=frozenset((413, 429, 500, 502, 503, 504)),
88
- total=RETRY_ABORT_NUM)))
89
- self._session.headers.update({
90
- **SHARED_HEADERS,
91
- **dict(cookie='; '.join(f'{cookie.name}={cookie.value}' \
92
- for cookie in extract_cookies_from_browser(browser, browser_profile)
93
- if 'instagram.com' in cookie.domain))
94
- })
95
- r = self._get_rate_limited('https://www.instagram.com', return_json=False)
96
- m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
97
- assert m is not None
98
- self._session.headers.update({'x-csrftoken': m.group(1)})
69
+ def add_video_url(self, url: str) -> None:
70
+ """Add a video URL to the list of video URLs."""
71
+ log.info('Added video URL: %s', url)
72
+ self.video_urls.append(url)
99
73
 
100
- def _save_to_log(self, url: str) -> None:
101
- if self._no_log:
102
- return
103
- self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
104
- self._connection.commit()
74
+ def add_csrf_token_header(self) -> None:
75
+ """
76
+ Add CSRF token header to the session.
77
+
78
+ Raises
79
+ ------
80
+ CSRFTokenNotFound
81
+ If the CSRF token is not found in the cookies.
82
+ """
83
+ token = self.session.cookies.get('csrftoken')
84
+ if not token:
85
+ raise CSRFTokenNotFound
86
+ self.session.headers.update({'x-csrftoken': token})
87
+
88
+ def graphql_query(self,
89
+ variables: Mapping[str, Any],
90
+ *,
91
+ cast_to: type[T],
92
+ doc_id: str = '9806959572732215') -> T | None:
93
+ """Make a GraphQL query."""
94
+ with self.session.post('https://www.instagram.com/graphql/query',
95
+ headers={
96
+ 'content-type': 'application/x-www-form-urlencoded',
97
+ } | API_HEADERS,
98
+ data={
99
+ 'doc_id': doc_id,
100
+ 'variables': json.dumps(variables, separators=(',', ':'))
101
+ }) as r:
102
+ if r.status_code != HTTPStatus.OK:
103
+ return None
104
+ data = r.json()
105
+ assert isinstance(data, dict)
106
+ if (status := data.get('status')) != 'ok':
107
+ log.error('GraphQL status not "ok": %s', status)
108
+ return None
109
+ if data.get('errors'):
110
+ log.warning('Response has errors.')
111
+ log.debug('Response: %s', json.dumps(data, indent=2))
112
+ if not data.get('data'):
113
+ log.error('No data in response.')
114
+ return cast('T', data['data'])
115
+
116
+ def get_text(self, url: str, *, params: Mapping[str, str] | None = None) -> str:
117
+ """Get text from a URL."""
118
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
119
+ r.raise_for_status()
120
+ return r.text
121
+
122
+ def highlights_tray(self, user_id: int | str) -> HighlightsTray:
123
+ """Get the highlights tray data for a user."""
124
+ return self.get_json(
125
+ f'https://i.instagram.com/api/v1/highlights/{user_id}/highlights_tray/',
126
+ cast_to=HighlightsTray)
127
+
128
+ def __enter__(self) -> Self: # pragma: no cover
129
+ """Recommended way to initialise the client."""
130
+ return self
131
+
132
+ def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
133
+ ___: TracebackType | None) -> None:
134
+ """Clean up."""
135
+
136
+ def is_saved(self, url: str) -> bool: # pragma: no cover
137
+ """Check if a URL is already saved."""
138
+ return False
105
139
 
106
- def _is_saved(self, url: str) -> bool:
107
- if self._no_log:
108
- return False
109
- self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
110
- count: int
111
- count, = self._cursor.fetchone()
112
- return count == 1
140
+ def save_to_log(self, url: str) -> None:
141
+ """Save a URL to the log."""
113
142
 
114
- def _save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem,
115
- timestamp: int) -> None:
143
+ def save_image_versions2(self, sub_item: CarouselMedia | MediaInfoItem, timestamp: int) -> None:
144
+ """Save images in the image_versions2 dictionary."""
116
145
  def key(x: MediaInfoItemImageVersions2Candidate) -> int:
117
146
  return x['width'] * x['height']
118
147
 
119
- best = sorted(sub_item['image_versions2']['candidates'], key=key, reverse=True)[0]
120
- if self._is_saved(best['url']):
148
+ best = max(sub_item['image_versions2']['candidates'], key=key)
149
+ if self.is_saved(best['url']):
150
+ return
151
+ r = self.session.head(best['url'])
152
+ if r.status_code != HTTPStatus.OK:
153
+ log.warning('HEAD request failed with status code %s.', r.status_code)
121
154
  return
122
- r = self._session.head(best['url'])
123
- r.raise_for_status()
124
155
  ext = get_extension(r.headers['content-type'])
125
156
  name = f'{sub_item["id"]}.{ext}'
126
- with open(name, 'wb') as f:
127
- for content in (self._session.get(best['url'],
128
- stream=True).iter_content(chunk_size=512)):
129
- f.write(content)
157
+ with Path(name).open('wb') as f:
158
+ f.writelines(self.session.get(best['url'], stream=True).iter_content(chunk_size=512))
130
159
  utime(name, (timestamp, timestamp))
131
- self._save_to_log(r.url)
160
+ self.save_to_log(r.url)
132
161
 
133
- def _save_comments(self, edge: Edge) -> None:
134
- if self._get_comments:
135
- comment_url = ('https://www.instagram.com/api/v1/media/'
136
- f'{edge["node"]["id"]}/comments/')
137
- shared_params = dict(can_support_threading='true')
138
- top_comment_data = comment_data = self._get_rate_limited(
139
- comment_url,
140
- params={
141
- **shared_params, 'permalink_enabled': 'false'
142
- },
143
- cast_to=Comments)
144
- while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
145
- comment_data = self._get_rate_limited(comment_url,
146
- params={
147
- **shared_params,
148
- 'min_id':
149
- comment_data['next_min_id'],
150
- },
151
- cast_to=Comments)
152
- top_comment_data['comments'].extend(comment_data['comments'])
153
- comments_json = f'{edge["node"]["id"]}-comments.json'
154
- with open(comments_json, 'w+') as f:
155
- json.dump(top_comment_data, f, sort_keys=True, indent=2)
162
+ def save_comments(self, edge: Edge) -> None:
163
+ """Save comments for an edge node."""
164
+ comment_url = ('https://www.instagram.com/api/v1/media/'
165
+ f'{edge["node"]["id"]}/comments/')
166
+ shared_params = {'can_support_threading': 'true'}
167
+ try:
168
+ comment_data = self.get_json(comment_url,
169
+ params={
170
+ **shared_params, 'permalink_enabled': 'false'
171
+ },
172
+ cast_to=Comments)
173
+ except HTTPError:
174
+ log.exception('Failed to get comments.')
175
+ return
176
+ top_comment_data: Any = comment_data
177
+ while comment_data['can_view_more_preview_comments'] and comment_data['next_min_id']:
178
+ try:
179
+ comment_data = self.get_json(comment_url,
180
+ params={
181
+ **shared_params,
182
+ 'min_id':
183
+ comment_data['next_min_id'],
184
+ },
185
+ cast_to=Comments)
186
+ except HTTPError:
187
+ log.exception('Failed to get comments.')
188
+ break
189
+ top_comment_data['comments'] = (list(top_comment_data['comments']) +
190
+ list(comment_data['comments']))
191
+ comments_json = f'{edge["node"]["id"]}-comments.json'
192
+ with Path(comments_json).open('w+', encoding='utf-8') as f:
193
+ json.dump(top_comment_data, f, sort_keys=True, indent=2)
156
194
 
157
- def _save_media(self, edge: Edge) -> None:
158
- media_info_url = ('https://i.instagram.com/api/v1/media/'
159
- f'{edge["node"]["id"]}/info/')
160
- if self._is_saved(media_info_url):
195
+ def save_media(self, edge: Edge) -> None:
196
+ """Save media for an edge node."""
197
+ log.info('Saving media at URL: https://www.instagram.com/p/%s', edge['node']['code'])
198
+ media_info_url = f'https://www.instagram.com/p/{edge["node"]["code"]}/'
199
+ if self.is_saved(media_info_url):
200
+ return
201
+ r = self.session.get(media_info_url, headers=PAGE_FETCH_HEADERS)
202
+ if r.status_code != HTTPStatus.OK:
203
+ log.warning('GET request failed with status code %s.', r.status_code)
204
+ return
205
+ if 'image_versions2' not in r.text or 'taken_at' not in r.text:
206
+ log.warning('Invalid response. image_versions2 dict not found.')
161
207
  return
162
- media_info = self._get_rate_limited(media_info_url, cast_to=MediaInfo)
163
- if media_info['more_available'] or media_info['num_results'] != 1:
164
- pp(media_info)
165
- raise ValueError('Unhandled more_available set to True')
208
+ soup = Soup(r.text, 'html5lib')
209
+ media_info_embedded = next(
210
+ json.loads(s) for s in (''.join(
211
+ getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
212
+ for script in soup.select('script[type="application/json"]'))
213
+ if 'image_versions2' in s and 'taken_at' in s)
214
+ media_info: MediaInfo = (
215
+ media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
216
+ ['result']['data']['xdt_api__v1__media__shortcode__web_info'])
166
217
  timestamp = media_info['items'][0]['taken_at']
167
218
  id_json_file = f'{edge["node"]["id"]}.json'
168
219
  media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -170,140 +221,48 @@ class InstagramClient:
170
221
  write_if_new(media_info_json_file, str(json_dumps_formatted(media_info)))
171
222
  for file in (id_json_file, media_info_json_file):
172
223
  utime(file, (timestamp, timestamp))
173
- self._save_to_log(media_info_url)
224
+ self.save_to_log(media_info_url)
174
225
  for item in media_info['items']:
175
226
  timestamp = item['taken_at']
176
- if 'carousel_media' in item:
177
- for sub_item in item['carousel_media']:
178
- self._save_image_versions2(sub_item, timestamp)
227
+ if (carousel_media := item.get('carousel_media')):
228
+ for sub_item in carousel_media:
229
+ self.save_image_versions2(sub_item, timestamp)
179
230
  elif 'image_versions2' in item:
180
- self._save_image_versions2(item, timestamp)
231
+ self.save_image_versions2(item, timestamp)
181
232
 
182
- def _save_stuff(self, edges: Collection[Edge], parent_edge: Edge | None = None) -> None:
233
+ def save_edges(self, edges: Iterable[Edge], parent_edge: Edge | None = None) -> None:
234
+ """Save edge node media."""
183
235
  for edge in edges:
184
- if edge['node']['__typename'] == 'GraphVideo':
236
+ if edge['node']['__typename'] == 'XDTMediaDict':
185
237
  try:
186
- shortcode = edge['node']['shortcode']
187
- except KeyError as e:
238
+ shortcode = edge['node']['code']
239
+ except KeyError:
188
240
  if parent_edge:
189
241
  try:
190
- shortcode = parent_edge['node']['shortcode']
191
- except KeyError as exc:
192
- raise ValueError('Unknown shortcode') from exc
242
+ shortcode = parent_edge['node']['code']
243
+ except KeyError:
244
+ log.exception('Unknown shortcode.')
245
+ return
193
246
  else:
194
- raise ValueError('Unknown shortcode') from e
195
- self._add_video_url(f'https://www.instagram.com/p/{shortcode}')
196
- elif edge['node']['__typename'] == 'GraphImage':
197
- self._save_media(edge)
198
- elif edge['node']['__typename'] == 'GraphSidecar':
199
- logger.debug('Recursion into child edges')
200
- if (not edge['node']['comments_disabled']
201
- and edge['node']['edge_media_to_comment']['count']):
202
- self._save_comments(edge)
203
- self._save_stuff(edge['node']['edge_sidecar_to_children']['edges'], edge)
247
+ log.exception('Unknown shortcode.')
248
+ if edge['node'].get('video_dash_manifest'):
249
+ self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
250
+ else:
251
+ try:
252
+ self.save_comments(edge)
253
+ self.save_media(edge)
254
+ except requests.exceptions.RetryError:
255
+ log.exception('Retries exhausted.')
256
+ return
204
257
  else:
205
- raise ValueError(f'Unknown type "{edge["node"]["__typename"]}"')
206
-
207
- @overload
208
- def _get_rate_limited(self, url: str, *, cast_to: Type[T]) -> T:
209
- pass
210
-
211
- @overload
212
- def _get_rate_limited(self,
213
- url: str,
214
- *,
215
- return_json: Literal[False] = False) -> requests.Response:
216
- pass
217
-
218
- @overload
219
- def _get_rate_limited(self,
220
- url: str,
221
- *,
222
- params: Mapping[str, str] | None = None,
223
- cast_to: Type[T]) -> T:
224
- pass
258
+ log.warning( # type: ignore[unreachable]
259
+ 'Unknown type: `%s`. Item %s will not be processed.',
260
+ edge['node']['__typename'], edge['node']['id'])
261
+ shortcode = edge['node']['code']
262
+ self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')
225
263
 
226
- @sleep_and_retry
227
- @limits(calls=10, period=60)
228
- def _get_rate_limited(
229
- self,
230
- url: str,
231
- *,
232
- return_json: bool = True,
233
- params: Mapping[str, str] | None = None,
234
- cast_to: Type[T] | None = None) -> T | requests.Response: # pylint: disable=unused-argument
235
- with self._session.get(url, params=params) as r:
264
+ def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
265
+ """Get JSON data from a URL."""
266
+ with self.session.get(url, params=params, headers=API_HEADERS) as r:
236
267
  r.raise_for_status()
237
- return r.json() if return_json else r
238
-
239
- def _highlights_tray(self, user_id: int | str) -> HighlightsTray:
240
- return self._get_rate_limited(
241
- f'https://i.instagram.com/api/v1/highlights/{user_id}/'
242
- 'highlights_tray/',
243
- cast_to=HighlightsTray)
244
-
245
- def __enter__(self) -> 'InstagramClient':
246
- return self
247
-
248
- def __exit__(self, _: Type[BaseException], __: BaseException, ___: Traceback) -> None:
249
- self._cursor.close()
250
- self._connection.close()
251
-
252
- def process(self) -> None:
253
- with chdir(self._output_dir):
254
- self._get_rate_limited(f'https://www.instagram.com/{self._username}/',
255
- return_json=False)
256
- r = self._get_rate_limited('https://i.instagram.com/api/v1/users/web_profile_info/',
257
- params={'username': self._username},
258
- cast_to=WebProfileInfo)
259
- with open('web_profile_info.json', 'w') as f:
260
- json.dump(r, f, indent=2, sort_keys=True)
261
- user_info = r['data']['user']
262
- if not self._is_saved(user_info['profile_pic_url_hd']):
263
- with open('profile_pic.jpg', 'wb') as f:
264
- for chunk in self._session.get(user_info['profile_pic_url_hd'],
265
- stream=True).iter_content(chunk_size=512):
266
- f.write(chunk)
267
- self._save_to_log(user_info['profile_pic_url_hd'])
268
- for item in self._highlights_tray(user_info['id'])['tray']:
269
- self._add_video_url('https://www.instagram.com/stories/highlights/'
270
- f'{item["id"].split(":")[-1]}/')
271
- self._save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
272
- page_info = user_info['edge_owner_to_timeline_media']['page_info']
273
- while page_info['has_next_page']:
274
- params = dict(query_hash='69cba40317214236af40e7efa697781d',
275
- variables=json.dumps(
276
- dict(id=user_info['id'], first=12,
277
- after=page_info['end_cursor'])))
278
- media = self._get_rate_limited(
279
- 'https://www.instagram.com/graphql/query/',
280
- params=params,
281
- cast_to=WebProfileInfo)['data']['user']['edge_owner_to_timeline_media']
282
- page_info = media['page_info']
283
- self._save_stuff(media['edges'])
284
- if len(self._video_urls) > 0:
285
- with yt_dlp.YoutubeDL({
286
- **SHARED_YT_DLP_OPTIONS, # type: ignore[misc]
287
- **{
288
- 'cookiesfrombrowser': [
289
- self._browser, self._browser_profile, None, None
290
- ],
291
- 'getcomments': self._get_comments,
292
- 'verbose': self._debug
293
- }
294
- }) as ydl:
295
- failed_urls: list[str] = []
296
- while (self._video_urls and (url := self._video_urls.pop())):
297
- if self._is_saved(url):
298
- logger.debug(f'{url} is already saved')
299
- continue
300
- if ydl.extract_info(url):
301
- logger.debug(f'Extracting {url}')
302
- self._save_to_log(url)
303
- else:
304
- failed_urls.append(url)
305
- if len(failed_urls) > 0:
306
- logger.error('Some video URIs failed. Check failed.txt.')
307
- with open('failed.txt', 'w') as f:
308
- for url in failed_urls:
309
- f.write(f'{url}\n')
268
+ return cast('T', r.json())