instagram-archiver 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of instagram-archiver might be problematic.
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/PKG-INFO +2 -2
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/README.md +1 -1
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/__init__.py +1 -1
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/client.py +23 -19
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/main.py +7 -0
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/profile_scraper.py +23 -19
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/saved_scraper.py +2 -1
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/typing.py +3 -1
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/man/instagram-archiver.1 +2 -2
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/pyproject.toml +1 -1
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/LICENSE.txt +0 -0
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/__main__.py +0 -0
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/constants.py +0 -0
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/py.typed +0 -0
- {instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/utils.py +0 -0
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: instagram-archiver
-Version: 0.3.0
+Version: 0.3.2
 Summary: Save Instagram content you have access to.
 License: MIT
 Keywords: command line,instagram
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
 [](https://pypi.org/project/instagram-archiver/)
 [](https://github.com/Tatsh/instagram-archiver/tags)
 [](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
-[](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.0/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.0...master)
+[](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.2/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.2...master)
 [](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
 [](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
 [](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/README.md

@@ -4,7 +4,7 @@
 [](https://pypi.org/project/instagram-archiver/)
 [](https://github.com/Tatsh/instagram-archiver/tags)
 [](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
-[](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.0/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.0...master)
+[](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.2/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.2...master)
 [](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
 [](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
 [](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/client.py

@@ -8,12 +8,11 @@ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
 import json
 import logging
 
-from bs4 import BeautifulSoup as Soup
 from requests import HTTPError
 from yt_dlp_utils import setup_session
 import requests
 
-from .constants import API_HEADERS,
+from .constants import API_HEADERS, SHARED_HEADERS
 from .typing import (
     CarouselMedia,
     Comments,
@@ -31,7 +30,7 @@ if TYPE_CHECKING:
 
     from .typing import BrowserName
 
-__all__ = ('CSRFTokenNotFound', 'InstagramClient')
+__all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')
 
 T = TypeVar('T')
 log = logging.getLogger(__name__)
@@ -41,6 +40,10 @@ class CSRFTokenNotFound(RuntimeError):
     """CSRF token not found in cookies."""
 
 
+class UnexpectedRedirect(RuntimeError):
+    """Unexpected redirect in a request."""
+
+
 class InstagramClient:
     """Generic client for Instagram."""
     def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
@@ -59,7 +62,6 @@ class InstagramClient:
             browser_profile,
             SHARED_HEADERS,
             domains={'instagram.com'},
-            setup_retry=True,
             status_forcelist=(413, 429, 500, 502, 503, 504))
         self.failed_urls: set[str] = set()
         """Set of failed URLs."""
@@ -193,27 +195,29 @@ class InstagramClient:
             json.dump(top_comment_data, f, sort_keys=True, indent=2)
 
     def save_media(self, edge: Edge) -> None:
-        """
-
-
+        """
+        Save media for an edge node.
+
+        Raises
+        ------
+        UnexpectedRedirect
+            If a redirect occurs unexpectedly.
+        """
+        media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
+        log.info('Saving media at URL: %s', media_info_url)
         if self.is_saved(media_info_url):
             return
-        r = self.session.get(media_info_url, headers=
+        r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
         if r.status_code != HTTPStatus.OK:
+            if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
+                raise UnexpectedRedirect
             log.warning('GET request failed with status code %s.', r.status_code)
+            log.debug('Content: %s', r.text)
             return
         if 'image_versions2' not in r.text or 'taken_at' not in r.text:
             log.warning('Invalid response. image_versions2 dict not found.')
             return
-
-        media_info_embedded = next(
-            json.loads(s) for s in (''.join(
-                getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
-                                    for script in soup.select('script[type="application/json"]'))
-            if 'image_versions2' in s and 'taken_at' in s)
-        media_info: MediaInfo = (
-            media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
-            ['result']['data']['xdt_api__v1__media__shortcode__web_info'])
+        media_info: MediaInfo = r.json()
         timestamp = media_info['items'][0]['taken_at']
         id_json_file = f'{edge["node"]["id"]}.json'
         media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
@@ -246,7 +250,7 @@ class InstagramClient:
         else:
             log.exception('Unknown shortcode.')
         if edge['node'].get('video_dash_manifest'):
-            self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
+            self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
         else:
             try:
                 self.save_comments(edge)
@@ -259,7 +263,7 @@ class InstagramClient:
                 'Unknown type: `%s`. Item %s will not be processed.',
                 edge['node']['__typename'], edge['node']['id'])
             shortcode = edge['node']['code']
-            self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')
+            self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')
 
     def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
         """Get JSON data from a URL."""
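For readers skimming the client.py hunk above: the new save_media path disables redirect following and treats a 301/302 response as a signal that Instagram is deflecting the request. Below is a minimal standalone sketch of that pattern using plain requests; the function name and headers are illustrative, not the package's API.

from http import HTTPStatus

import requests


class UnexpectedRedirect(RuntimeError):
    """Raised when the server redirects instead of returning JSON."""


def fetch_media_info(session: requests.Session, url: str) -> dict | None:
    # Illustrative headers; the real client sends its own API_HEADERS.
    response = session.get(url, headers={'Accept': 'application/json'}, allow_redirects=False)
    if response.status_code != HTTPStatus.OK:
        # A 301/302 here usually means a login or rate-limit page, not media data.
        if response.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
            raise UnexpectedRedirect
        return None
    return response.json()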
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/main.py

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
 
 import click
 
+from .client import UnexpectedRedirect
 from .constants import BROWSER_CHOICES
 from .profile_scraper import ProfileScraper
 from .saved_scraper import SavedScraper
@@ -55,6 +56,9 @@ def main(output_dir: str,
                      if '%(username)s' in output_dir else Path(output_dir)),
                  username=username) as client:
             client.process()
+    except UnexpectedRedirect as e:
+        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
+        raise click.Abort from e
     except Exception as e:
         if isinstance(e, KeyboardInterrupt) or debug:
             raise
@@ -91,6 +95,9 @@ def save_saved_main(output_dir: str,
     setup_logging(debug=debug)
     try:
         SavedScraper(browser, profile, output_dir, comments=include_comments).process(unsave=unsave)
+    except UnexpectedRedirect as e:
+        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
+        raise click.Abort from e
     except Exception as e:
         if isinstance(e, KeyboardInterrupt) or debug:
             raise
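Both entry points now route the new exception through click and exit cleanly. A minimal sketch of that error-handling shape follows, assuming a click command; run_scraper is a hypothetical stand-in for the real scraper call.

import click


class UnexpectedRedirect(RuntimeError):
    """Stand-in for instagram_archiver.client.UnexpectedRedirect."""


def run_scraper() -> None:
    """Placeholder for the real scraping call."""
    raise UnexpectedRedirect


@click.command()
def main() -> None:
    try:
        run_scraper()
    except UnexpectedRedirect as e:
        # Report on stderr and exit non-zero without dumping a traceback.
        click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
        raise click.Abort from e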
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/profile_scraper.py

@@ -116,22 +116,26 @@ class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
         r = self.get_json('https://i.instagram.com/api/v1/users/web_profile_info/',
                           params={'username': self._username},
                           cast_to=WebProfileInfo)
-
-        json.
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
+        if 'data' in r:
+            with Path('web_profile_info.json').open('w', encoding='utf-8') as f:
+                json.dump(r, f, indent=2, sort_keys=True)
+            user_info = r['data']['user']
+            if not self.is_saved(user_info['profile_pic_url_hd']):
+                with Path('profile_pic.jpg').open('wb') as f:
+                    f.writelines(
+                        self.session.get(user_info['profile_pic_url_hd'],
+                                         stream=True).iter_content(chunk_size=512))
+                self.save_to_log(user_info['profile_pic_url_hd'])
+            try:
+                for item in self.highlights_tray(user_info['id'])['tray']:
+                    self.add_video_url('https://www.instagram.com/stories/highlights/'
+                                       f'{item["id"].split(":")[-1]}/')
+            except HTTPError:
+                log.exception('Failed to get highlights data.')
+            self.save_edges(user_info['edge_owner_to_timeline_media']['edges'])
+        else:
+            log.warning(
+                'Failed to get user info. Profile information and image will not be saved.')
         d = self.graphql_query(
             {
                 'data': {
@@ -180,15 +184,15 @@ class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
         with get_configured_yt_dlp() as ydl:
             while self.video_urls and (url := self.video_urls.pop()):
                 if self.is_saved(url):
-                    log.info('
+                    log.info('%s is already saved.', url)
                     continue
                 if ydl.extract_info(url):
-                    log.info('
+                    log.info('Downloading video: %s', url)
                     self.save_to_log(url)
                 else:
                     self.failed_urls.add(url)
         if self.failed_urls:
-            log.warning('Some
+            log.warning('Some URIs failed. Check failed.txt.')
             with Path('failed.txt').open('w', encoding='utf-8') as f:
                 for url in self.failed_urls:
                     f.write(f'{url}\n')
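The rewritten profile path above streams the profile picture to disk in 512-byte chunks rather than buffering the whole image. A minimal sketch of that download pattern with requests; the URL and filename below are placeholders.

from pathlib import Path

import requests


def download_image(url: str, dest: Path) -> None:
    # stream=True defers the body download; iter_content yields it in chunks.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    with dest.open('wb') as f:
        f.writelines(response.iter_content(chunk_size=512))


# Example with a hypothetical URL:
# download_image('https://example.com/profile_pic.jpg', Path('profile_pic.jpg'))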
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/saved_scraper.py

@@ -20,7 +20,7 @@ __all__ = ('SavedScraper',)
 log = logging.getLogger(__name__)
 
 
-class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
+class SavedScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
     """Scrape saved posts."""
     def __init__(
         self,
@@ -69,6 +69,7 @@ class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
             'id': item['media']['id'],
             'code': item['media']['code'],
             'owner': item['media']['owner'],
+            'pk': item['media']['pk'],
             'video_dash_manifest': item['media'].get('video_dash_manifest')
         }
     } for item in feed['items'])
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/instagram_archiver/typing.py

@@ -131,6 +131,8 @@ class XDTMediaDict(TypedDict):
     """Media ID."""
     owner: Owner
     """Owner information."""
+    pk: str
+    """Primary key. Also carousel ID."""
     video_dash_manifest: NotRequired[str | None]
     """Video dash manifest URL, if available."""
 
@@ -161,7 +163,7 @@ class WebProfileInfoData(TypedDict):
 
 class WebProfileInfo(TypedDict):
     """Profile information container."""
-    data: WebProfileInfoData
+    data: NotRequired[WebProfileInfoData]
     """Profile data."""
 
 
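Marking data as NotRequired is what lets the scraper's new `if 'data' in r:` guard type-check cleanly while still documenting that the response may omit the key. A simplified sketch of the pattern; these TypedDict shapes are stand-ins, not the package's full definitions.

from typing import NotRequired, TypedDict


class WebProfileInfoData(TypedDict):
    user: dict  # Simplified; the real definition nests a full user mapping.


class WebProfileInfo(TypedDict):
    data: NotRequired[WebProfileInfoData]


def handle(r: WebProfileInfo) -> None:
    if 'data' in r:  # Narrow before indexing the optional key.
        print(r['data']['user'])
    else:
        print('No profile data returned.')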
{instagram_archiver-0.3.0 → instagram_archiver-0.3.2}/man/instagram-archiver.1

@@ -27,9 +27,9 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .in \\n[rst2man-indent\\n[rst2man-indent-level]]u
 ..
-.TH "INSTAGRAM-ARCHIVER" "1" "May 12, 2025" "0.3.0" "instagram-archiver"
+.TH "INSTAGRAM-ARCHIVER" "1" "May 12, 2025" "0.3.2" "instagram-archiver"
 .SH NAME
-instagram-archiver \- instagram-archiver v0.3.0
+instagram-archiver \- instagram-archiver v0.3.2
 .SH COMMANDS
 .SS instagram\-archiver
 .sp