instagram-archiver 0.3.0.tar.gz → 0.3.2.tar.gz

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release: this version of instagram-archiver might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: instagram-archiver
- Version: 0.3.0
+ Version: 0.3.2
  Summary: Save Instagram content you have access to.
  License: MIT
  Keywords: command line,instagram
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
  [![PyPI - Version](https://img.shields.io/pypi/v/instagram-archiver)](https://pypi.org/project/instagram-archiver/)
  [![GitHub tag (with filter)](https://img.shields.io/github/v/tag/Tatsh/instagram-archiver)](https://github.com/Tatsh/instagram-archiver/tags)
  [![License](https://img.shields.io/github/license/Tatsh/instagram-archiver)](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
- [![GitHub commits since latest release (by SemVer including pre-releases)](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.0/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.0...master)
+ [![GitHub commits since latest release (by SemVer including pre-releases)](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.2/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.2...master)
  [![QA](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml/badge.svg)](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
  [![Tests](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml/badge.svg)](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
  [![Coverage Status](https://coveralls.io/repos/github/Tatsh/instagram-archiver/badge.svg?branch=master)](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
@@ -4,7 +4,7 @@
  [![PyPI - Version](https://img.shields.io/pypi/v/instagram-archiver)](https://pypi.org/project/instagram-archiver/)
  [![GitHub tag (with filter)](https://img.shields.io/github/v/tag/Tatsh/instagram-archiver)](https://github.com/Tatsh/instagram-archiver/tags)
  [![License](https://img.shields.io/github/license/Tatsh/instagram-archiver)](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
- [![GitHub commits since latest release (by SemVer including pre-releases)](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.0/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.0...master)
+ [![GitHub commits since latest release (by SemVer including pre-releases)](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.2/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.2...master)
  [![QA](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml/badge.svg)](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
  [![Tests](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml/badge.svg)](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
  [![Coverage Status](https://coveralls.io/repos/github/Tatsh/instagram-archiver/badge.svg?branch=master)](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
@@ -6,4 +6,4 @@ from .profile_scraper import ProfileScraper
  from .saved_scraper import SavedScraper

  __all__ = ('InstagramClient', 'ProfileScraper', 'SavedScraper')
- __version__ = 'v0.3.0'
+ __version__ = 'v0.3.2'
@@ -8,12 +8,11 @@ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
  import json
  import logging

- from bs4 import BeautifulSoup as Soup
  from requests import HTTPError
  from yt_dlp_utils import setup_session
  import requests

- from .constants import API_HEADERS, PAGE_FETCH_HEADERS, SHARED_HEADERS
+ from .constants import API_HEADERS, SHARED_HEADERS
  from .typing import (
  CarouselMedia,
  Comments,
@@ -31,7 +30,7 @@ if TYPE_CHECKING:

  from .typing import BrowserName

- __all__ = ('CSRFTokenNotFound', 'InstagramClient')
+ __all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')

  T = TypeVar('T')
  log = logging.getLogger(__name__)
@@ -41,6 +40,10 @@ class CSRFTokenNotFound(RuntimeError):
  """CSRF token not found in cookies."""


+ class UnexpectedRedirect(RuntimeError):
+ """Unexpected redirect in a request."""
+
+
  class InstagramClient:
  """Generic client for Instagram."""
  def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
@@ -59,7 +62,6 @@ class InstagramClient:
  browser_profile,
  SHARED_HEADERS,
  domains={'instagram.com'},
- setup_retry=True,
  status_forcelist=(413, 429, 500, 502, 503, 504))
  self.failed_urls: set[str] = set()
  """Set of failed URLs."""
@@ -193,27 +195,29 @@ class InstagramClient:
  json.dump(top_comment_data, f, sort_keys=True, indent=2)

  def save_media(self, edge: Edge) -> None:
- """Save media for an edge node."""
- log.info('Saving media at URL: https://www.instagram.com/p/%s', edge['node']['code'])
- media_info_url = f'https://www.instagram.com/p/{edge["node"]["code"]}/'
+ """
+ Save media for an edge node.
+
+ Raises
+ ------
+ UnexpectedRedirect
+ If a redirect occurs unexpectedly.
+ """
+ media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
+ log.info('Saving media at URL: %s', media_info_url)
  if self.is_saved(media_info_url):
  return
- r = self.session.get(media_info_url, headers=PAGE_FETCH_HEADERS)
+ r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
  if r.status_code != HTTPStatus.OK:
+ if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
+ raise UnexpectedRedirect
  log.warning('GET request failed with status code %s.', r.status_code)
+ log.debug('Content: %s', r.text)
  return
  if 'image_versions2' not in r.text or 'taken_at' not in r.text:
  log.warning('Invalid response. image_versions2 dict not found.')
  return
- soup = Soup(r.text, 'html5lib')
- media_info_embedded = next(
- json.loads(s) for s in (''.join(
- getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
- for script in soup.select('script[type="application/json"]'))
- if 'image_versions2' in s and 'taken_at' in s)
- media_info: MediaInfo = (
- media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
- ['result']['data']['xdt_api__v1__media__shortcode__web_info'])
+ media_info: MediaInfo = r.json()
  timestamp = media_info['items'][0]['taken_at']
  id_json_file = f'{edge["node"]["id"]}.json'
  media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
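
The rewritten save_media above stops scraping the embedded JSON out of the post's HTML page (hence the removal of the bs4 import and PAGE_FETCH_HEADERS earlier in the diff) and instead queries the /api/v1/media/{pk}/info/ endpoint directly, with redirect following disabled so that a 301/302, which the CLI changes below treat as a sign the request limit was reached, surfaces as UnexpectedRedirect. A condensed, standalone sketch of that guard pattern using plain requests (the api_headers parameter stands in for the package's API_HEADERS and is not its real value):

    from http import HTTPStatus

    import requests

    class UnexpectedRedirect(RuntimeError):
        """Unexpected redirect in a request."""

    def fetch_media_info(session: requests.Session, pk: str,
                         api_headers: dict[str, str]) -> dict:
        """Fetch media info JSON for a post, refusing to follow redirects."""
        url = f'https://www.instagram.com/api/v1/media/{pk}/info/'
        # A 301/302 here typically means the client is being bounced to a login
        # page rather than given data, so raise instead of following it.
        r = session.get(url, headers=api_headers, allow_redirects=False)
        if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
            raise UnexpectedRedirect
        r.raise_for_status()
        return r.json()
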
@@ -246,7 +250,7 @@
  else:
  log.exception('Unknown shortcode.')
  if edge['node'].get('video_dash_manifest'):
- self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
+ self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
  else:
  try:
  self.save_comments(edge)
@@ -259,7 +263,7 @@
  'Unknown type: `%s`. Item %s will not be processed.',
  edge['node']['__typename'], edge['node']['id'])
  shortcode = edge['node']['code']
- self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')
+ self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')

  def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
  """Get JSON data from a URL."""
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING

  import click

+ from .client import UnexpectedRedirect
  from .constants import BROWSER_CHOICES
  from .profile_scraper import ProfileScraper
  from .saved_scraper import SavedScraper
@@ -55,6 +56,9 @@ def main(output_dir: str,
  if '%(username)s' in output_dir else Path(output_dir)),
  username=username) as client:
  client.process()
+ except UnexpectedRedirect as e:
+ click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
+ raise click.Abort from e
  except Exception as e:
  if isinstance(e, KeyboardInterrupt) or debug:
  raise
@@ -91,6 +95,9 @@ def save_saved_main(output_dir: str,
  setup_logging(debug=debug)
  try:
  SavedScraper(browser, profile, output_dir, comments=include_comments).process(unsave=unsave)
+ except UnexpectedRedirect as e:
+ click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
+ raise click.Abort from e
  except Exception as e:
  if isinstance(e, KeyboardInterrupt) or debug:
  raise
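
Both CLI entry points now convert UnexpectedRedirect into a clean click.Abort with a short hint instead of letting it fall through to the generic exception handler. A minimal illustration of the pattern (the archive command and run_scraper helper below are invented for this sketch and are not part of the package):

    import click

    class UnexpectedRedirect(RuntimeError):
        """Raised when a request is unexpectedly redirected."""

    def run_scraper() -> None:
        # Stand-in for the ProfileScraper/SavedScraper processing step.
        raise UnexpectedRedirect

    @click.command()
    def archive() -> None:
        try:
            run_scraper()
        except UnexpectedRedirect as e:
            # Explain why we stopped, then exit non-zero without a traceback.
            click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
            raise click.Abort from e
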
@@ -116,22 +116,26 @@ class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
  r = self.get_json('https://i.instagram.com/api/v1/users/web_profile_info/',
  params={'username': self._username},
  cast_to=WebProfileInfo)
- with Path('web_profile_info.json').open('w', encoding='utf-8') as f:
- json.dump(r, f, indent=2, sort_keys=True)
- user_info = r['data']['user']
- if not self.is_saved(user_info['profile_pic_url_hd']):
- with Path('profile_pic.jpg').open('wb') as f:
- f.writelines(
- self.session.get(user_info['profile_pic_url_hd'],
- stream=True).iter_content(chunk_size=512))
- self.save_to_log(user_info['profile_pic_url_hd'])
- try:
- for item in self.highlights_tray(user_info['id'])['tray']:
- self.add_video_url('https://www.instagram.com/stories/highlights/'
- f'{item["id"].split(":")[-1]}/')
- except HTTPError:
- log.exception('Failed to get highlights data.')
- self.save_edges(user_info['edge_owner_to_timeline_media']['edges'])
+ if 'data' in r:
+ with Path('web_profile_info.json').open('w', encoding='utf-8') as f:
+ json.dump(r, f, indent=2, sort_keys=True)
+ user_info = r['data']['user']
+ if not self.is_saved(user_info['profile_pic_url_hd']):
+ with Path('profile_pic.jpg').open('wb') as f:
+ f.writelines(
+ self.session.get(user_info['profile_pic_url_hd'],
+ stream=True).iter_content(chunk_size=512))
+ self.save_to_log(user_info['profile_pic_url_hd'])
+ try:
+ for item in self.highlights_tray(user_info['id'])['tray']:
+ self.add_video_url('https://www.instagram.com/stories/highlights/'
+ f'{item["id"].split(":")[-1]}/')
+ except HTTPError:
+ log.exception('Failed to get highlights data.')
+ self.save_edges(user_info['edge_owner_to_timeline_media']['edges'])
+ else:
+ log.warning(
+ 'Failed to get user info. Profile information and image will not be saved.')
  d = self.graphql_query(
  {
  'data': {
@@ -180,15 +184,15 @@ class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
  with get_configured_yt_dlp() as ydl:
  while self.video_urls and (url := self.video_urls.pop()):
  if self.is_saved(url):
- log.info('`%s` is already saved.', url)
+ log.info('%s is already saved.', url)
  continue
  if ydl.extract_info(url):
- log.info('Extracting `%s`.', url)
+ log.info('Downloading video: %s', url)
  self.save_to_log(url)
  else:
  self.failed_urls.add(url)
  if self.failed_urls:
- log.warning('Some video URIs failed. Check failed.txt.')
+ log.warning('Some URIs failed. Check failed.txt.')
  with Path('failed.txt').open('w', encoding='utf-8') as f:
  for url in self.failed_urls:
  f.write(f'{url}\n')
@@ -20,7 +20,7 @@ __all__ = ('SavedScraper',)
  log = logging.getLogger(__name__)


- class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
+ class SavedScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
  """Scrape saved posts."""
  def __init__(
  self,
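
Swapping the base-class order brings SavedScraper in line with ProfileScraper: under Python's method resolution order, SaveCommentsCheckDisabledMixin has to come before InstagramClient for the mixin's overrides to win. A toy example of why the order matters (all class and method names below, other than the two just mentioned, are invented for this sketch):

    class Client:
        def should_save_comments(self) -> bool:
            return True

    class CheckDisabledMixin:
        # Intended to override the base behaviour.
        def should_save_comments(self) -> bool:
            return False

    class Wrong(Client, CheckDisabledMixin):   # mixin listed last: base class wins
        pass

    class Right(CheckDisabledMixin, Client):   # mixin listed first: override wins
        pass

    assert Wrong().should_save_comments() is True
    assert Right().should_save_comments() is False
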
@@ -69,6 +69,7 @@ class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
  'id': item['media']['id'],
  'code': item['media']['code'],
  'owner': item['media']['owner'],
+ 'pk': item['media']['pk'],
  'video_dash_manifest': item['media'].get('video_dash_manifest')
  }
  } for item in feed['items'])
@@ -131,6 +131,8 @@ class XDTMediaDict(TypedDict):
  """Media ID."""
  owner: Owner
  """Owner information."""
+ pk: str
+ """Primary key. Also carousel ID."""
  video_dash_manifest: NotRequired[str | None]
  """Video dash manifest URL, if available."""

@@ -161,7 +163,7 @@ class WebProfileInfoData(TypedDict):

  class WebProfileInfo(TypedDict):
  """Profile information container."""
- data: WebProfileInfoData
+ data: NotRequired[WebProfileInfoData]
  """Profile data."""

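
Marking data as NotRequired matches the new 'data' in r guard added to ProfileScraper earlier in the diff: the web_profile_info response can arrive without profile data, so callers have to check for the key before indexing it. A trimmed-down sketch of how NotRequired interacts with that check (these TypedDicts are simplified stand-ins, not the package's full definitions):

    from typing import NotRequired, TypedDict

    class WebProfileInfoData(TypedDict):
        user: dict  # simplified; the real type nests further TypedDicts

    class WebProfileInfo(TypedDict):
        data: NotRequired[WebProfileInfoData]

    def handle(r: WebProfileInfo) -> None:
        if 'data' in r:
            # Inside this branch the key is known to be present.
            user_info = r['data']['user']
            print(user_info)
        else:
            print('Failed to get user info. Profile information will not be saved.')
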
@@ -27,9 +27,9 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
  .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
  .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
  ..
- .TH "INSTAGRAM-ARCHIVER" "1" "May 12, 2025" "0.3.0" "instagram-archiver"
+ .TH "INSTAGRAM-ARCHIVER" "1" "May 12, 2025" "0.3.2" "instagram-archiver"
  .SH NAME
- instagram-archiver \- instagram-archiver v0.3.0
+ instagram-archiver \- instagram-archiver v0.3.2
  .SH COMMANDS
  .SS instagram\-archiver
  .sp
@@ -18,7 +18,7 @@ keywords = ["command line", "instagram"]
  license = "MIT"
  name = "instagram-archiver"
  readme = "README.md"
- version = "0.3.0"
+ version = "0.3.2"

  [[project.authors]]
  email = "audvare@gmail.com"