instagram-archiver 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of instagram-archiver might be problematic. Click here for more details.
- instagram_archiver/__init__.py +1 -1
- instagram_archiver/client.py +23 -19
- instagram_archiver/main.py +7 -0
- instagram_archiver/saved_scraper.py +2 -1
- instagram_archiver/typing.py +2 -0
- {instagram_archiver-0.3.0.dist-info → instagram_archiver-0.3.1.dist-info}/METADATA +2 -2
- instagram_archiver-0.3.1.dist-info/RECORD +15 -0
- instagram_archiver-0.3.0.dist-info/RECORD +0 -15
- {instagram_archiver-0.3.0.dist-info → instagram_archiver-0.3.1.dist-info}/LICENSE.txt +0 -0
- {instagram_archiver-0.3.0.dist-info → instagram_archiver-0.3.1.dist-info}/WHEEL +0 -0
- {instagram_archiver-0.3.0.dist-info → instagram_archiver-0.3.1.dist-info}/entry_points.txt +0 -0
instagram_archiver/__init__.py
CHANGED
instagram_archiver/client.py
CHANGED
|
@@ -8,12 +8,11 @@ from typing import TYPE_CHECKING, Any, Self, TypeVar, cast
|
|
|
8
8
|
import json
|
|
9
9
|
import logging
|
|
10
10
|
|
|
11
|
-
from bs4 import BeautifulSoup as Soup
|
|
12
11
|
from requests import HTTPError
|
|
13
12
|
from yt_dlp_utils import setup_session
|
|
14
13
|
import requests
|
|
15
14
|
|
|
16
|
-
from .constants import API_HEADERS,
|
|
15
|
+
from .constants import API_HEADERS, SHARED_HEADERS
|
|
17
16
|
from .typing import (
|
|
18
17
|
CarouselMedia,
|
|
19
18
|
Comments,
|
|
@@ -31,7 +30,7 @@ if TYPE_CHECKING:
|
|
|
31
30
|
|
|
32
31
|
from .typing import BrowserName
|
|
33
32
|
|
|
34
|
-
__all__ = ('CSRFTokenNotFound', 'InstagramClient')
|
|
33
|
+
__all__ = ('CSRFTokenNotFound', 'InstagramClient', 'UnexpectedRedirect')
|
|
35
34
|
|
|
36
35
|
T = TypeVar('T')
|
|
37
36
|
log = logging.getLogger(__name__)
|
|
@@ -41,6 +40,10 @@ class CSRFTokenNotFound(RuntimeError):
|
|
|
41
40
|
"""CSRF token not found in cookies."""
|
|
42
41
|
|
|
43
42
|
|
|
43
|
+
class UnexpectedRedirect(RuntimeError):
|
|
44
|
+
"""Unexpected redirect in a request."""
|
|
45
|
+
|
|
46
|
+
|
|
44
47
|
class InstagramClient:
|
|
45
48
|
"""Generic client for Instagram."""
|
|
46
49
|
def __init__(self, browser: BrowserName = 'chrome', browser_profile: str = 'Default') -> None:
|
|
@@ -59,7 +62,6 @@ class InstagramClient:
|
|
|
59
62
|
browser_profile,
|
|
60
63
|
SHARED_HEADERS,
|
|
61
64
|
domains={'instagram.com'},
|
|
62
|
-
setup_retry=True,
|
|
63
65
|
status_forcelist=(413, 429, 500, 502, 503, 504))
|
|
64
66
|
self.failed_urls: set[str] = set()
|
|
65
67
|
"""Set of failed URLs."""
|
|
@@ -193,27 +195,29 @@ class InstagramClient:
|
|
|
193
195
|
json.dump(top_comment_data, f, sort_keys=True, indent=2)
|
|
194
196
|
|
|
195
197
|
def save_media(self, edge: Edge) -> None:
|
|
196
|
-
"""
|
|
197
|
-
|
|
198
|
-
|
|
198
|
+
"""
|
|
199
|
+
Save media for an edge node.
|
|
200
|
+
|
|
201
|
+
Raises
|
|
202
|
+
------
|
|
203
|
+
UnexpectedRedirect
|
|
204
|
+
If a redirect occurs unexpectedly.
|
|
205
|
+
"""
|
|
206
|
+
media_info_url = f'https://www.instagram.com/api/v1/media/{edge["node"]["pk"]}/info/'
|
|
207
|
+
log.info('Saving media at URL: %s', media_info_url)
|
|
199
208
|
if self.is_saved(media_info_url):
|
|
200
209
|
return
|
|
201
|
-
r = self.session.get(media_info_url, headers=
|
|
210
|
+
r = self.session.get(media_info_url, headers=API_HEADERS, allow_redirects=False)
|
|
202
211
|
if r.status_code != HTTPStatus.OK:
|
|
212
|
+
if r.status_code in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}:
|
|
213
|
+
raise UnexpectedRedirect
|
|
203
214
|
log.warning('GET request failed with status code %s.', r.status_code)
|
|
215
|
+
log.debug('Content: %s', r.text)
|
|
204
216
|
return
|
|
205
217
|
if 'image_versions2' not in r.text or 'taken_at' not in r.text:
|
|
206
218
|
log.warning('Invalid response. image_versions2 dict not found.')
|
|
207
219
|
return
|
|
208
|
-
|
|
209
|
-
media_info_embedded = next(
|
|
210
|
-
json.loads(s) for s in (''.join(
|
|
211
|
-
getattr(c, 'text', '') for c in getattr(script, 'contents', ''))
|
|
212
|
-
for script in soup.select('script[type="application/json"]'))
|
|
213
|
-
if 'image_versions2' in s and 'taken_at' in s)
|
|
214
|
-
media_info: MediaInfo = (
|
|
215
|
-
media_info_embedded['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']
|
|
216
|
-
['result']['data']['xdt_api__v1__media__shortcode__web_info'])
|
|
220
|
+
media_info: MediaInfo = r.json()
|
|
217
221
|
timestamp = media_info['items'][0]['taken_at']
|
|
218
222
|
id_json_file = f'{edge["node"]["id"]}.json'
|
|
219
223
|
media_info_json_file = f'{edge["node"]["id"]}-media-info-0000.json'
|
|
@@ -246,7 +250,7 @@ class InstagramClient:
|
|
|
246
250
|
else:
|
|
247
251
|
log.exception('Unknown shortcode.')
|
|
248
252
|
if edge['node'].get('video_dash_manifest'):
|
|
249
|
-
self.add_video_url(f'https://www.instagram.com/p/{shortcode}')
|
|
253
|
+
self.add_video_url(f'https://www.instagram.com/p/{shortcode}/')
|
|
250
254
|
else:
|
|
251
255
|
try:
|
|
252
256
|
self.save_comments(edge)
|
|
@@ -259,7 +263,7 @@ class InstagramClient:
|
|
|
259
263
|
'Unknown type: `%s`. Item %s will not be processed.',
|
|
260
264
|
edge['node']['__typename'], edge['node']['id'])
|
|
261
265
|
shortcode = edge['node']['code']
|
|
262
|
-
self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}')
|
|
266
|
+
self.failed_urls.add(f'https://www.instagram.com/p/{shortcode}/')
|
|
263
267
|
|
|
264
268
|
def get_json(self, url: str, *, cast_to: type[T], params: Mapping[str, str] | None = None) -> T:
|
|
265
269
|
"""Get JSON data from a URL."""
|
instagram_archiver/main.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING
|
|
|
6
6
|
|
|
7
7
|
import click
|
|
8
8
|
|
|
9
|
+
from .client import UnexpectedRedirect
|
|
9
10
|
from .constants import BROWSER_CHOICES
|
|
10
11
|
from .profile_scraper import ProfileScraper
|
|
11
12
|
from .saved_scraper import SavedScraper
|
|
@@ -55,6 +56,9 @@ def main(output_dir: str,
|
|
|
55
56
|
if '%(username)s' in output_dir else Path(output_dir)),
|
|
56
57
|
username=username) as client:
|
|
57
58
|
client.process()
|
|
59
|
+
except UnexpectedRedirect as e:
|
|
60
|
+
click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
|
|
61
|
+
raise click.Abort from e
|
|
58
62
|
except Exception as e:
|
|
59
63
|
if isinstance(e, KeyboardInterrupt) or debug:
|
|
60
64
|
raise
|
|
@@ -91,6 +95,9 @@ def save_saved_main(output_dir: str,
|
|
|
91
95
|
setup_logging(debug=debug)
|
|
92
96
|
try:
|
|
93
97
|
SavedScraper(browser, profile, output_dir, comments=include_comments).process(unsave=unsave)
|
|
98
|
+
except UnexpectedRedirect as e:
|
|
99
|
+
click.echo('Unexpected redirect. Assuming request limit has been reached.', err=True)
|
|
100
|
+
raise click.Abort from e
|
|
94
101
|
except Exception as e:
|
|
95
102
|
if isinstance(e, KeyboardInterrupt) or debug:
|
|
96
103
|
raise
|
|
@@ -20,7 +20,7 @@ __all__ = ('SavedScraper',)
|
|
|
20
20
|
log = logging.getLogger(__name__)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
|
|
23
|
+
class SavedScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
|
|
24
24
|
"""Scrape saved posts."""
|
|
25
25
|
def __init__(
|
|
26
26
|
self,
|
|
@@ -69,6 +69,7 @@ class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
|
|
|
69
69
|
'id': item['media']['id'],
|
|
70
70
|
'code': item['media']['code'],
|
|
71
71
|
'owner': item['media']['owner'],
|
|
72
|
+
'pk': item['media']['pk'],
|
|
72
73
|
'video_dash_manifest': item['media'].get('video_dash_manifest')
|
|
73
74
|
}
|
|
74
75
|
} for item in feed['items'])
|
instagram_archiver/typing.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: instagram-archiver
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Save Instagram content you have access to.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: command line,instagram
|
|
@@ -33,7 +33,7 @@ Description-Content-Type: text/markdown
|
|
|
33
33
|
[](https://pypi.org/project/instagram-archiver/)
|
|
34
34
|
[](https://github.com/Tatsh/instagram-archiver/tags)
|
|
35
35
|
[](https://github.com/Tatsh/instagram-archiver/blob/master/LICENSE.txt)
|
|
36
|
-
[](https://img.shields.io/github/commits-since/Tatsh/instagram-archiver/v0.3.1/master)](https://github.com/Tatsh/instagram-archiver/compare/v0.3.1...master)
|
|
37
37
|
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/qa.yml)
|
|
38
38
|
[](https://github.com/Tatsh/instagram-archiver/actions/workflows/tests.yml)
|
|
39
39
|
[](https://coveralls.io/github/Tatsh/instagram-archiver?branch=master)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
instagram_archiver/__init__.py,sha256=FtYFsiDxC03kcF2BoC_fCDdSIV_Q81cMBQIeYhA9hwk,270
|
|
2
|
+
instagram_archiver/__main__.py,sha256=oQ6s6zvZTBiEOgt-qep3bDY9ayxSanQr7KHzr6ENK0o,115
|
|
3
|
+
instagram_archiver/client.py,sha256=xde2O-ADvyFw5q7bZxbX9R2ae02ToJUElw-IKSbQhss,11091
|
|
4
|
+
instagram_archiver/constants.py,sha256=NJ8QlQZviY3dwwrIONThK_G9VcvAzOQM6Yg-hSyaj9A,1459
|
|
5
|
+
instagram_archiver/main.py,sha256=5blHCN2PSNGdTlIvXPyWr5vbgM6fW1Wn9nbzJUtBS2Y,4173
|
|
6
|
+
instagram_archiver/profile_scraper.py,sha256=BGnJZD3rF2e4aHpqSvG7UBoAiDjTyrpcYKSzsDgHO3M,7988
|
|
7
|
+
instagram_archiver/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
instagram_archiver/saved_scraper.py,sha256=PyrRCQ00x65nyiIUvmMw5-paJ6_aVrl5xmcTrHi5tHo,2769
|
|
9
|
+
instagram_archiver/typing.py,sha256=svEWoYwGXhkbdISw1r6tuOzo_5TwrhQTn3q0VxzBIlY,4359
|
|
10
|
+
instagram_archiver/utils.py,sha256=l6f0W_brZhVPOjlKwoFYYum7ICyHJXpboTU7ANIQSPI,3842
|
|
11
|
+
instagram_archiver-0.3.1.dist-info/LICENSE.txt,sha256=cDLmbhzFwEUz5FL_OnA6Jp9zdz80330J6YyEq-00yNQ,1093
|
|
12
|
+
instagram_archiver-0.3.1.dist-info/METADATA,sha256=qoIzF1FkbuqwVB9npS6JE7xS5o-fVdVSf9k4XqvDISs,5925
|
|
13
|
+
instagram_archiver-0.3.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
14
|
+
instagram_archiver-0.3.1.dist-info/entry_points.txt,sha256=kNXd0Sy6896DEBRcx2mVYiaE-OR9-XR56MpWuaNa49g,128
|
|
15
|
+
instagram_archiver-0.3.1.dist-info/RECORD,,
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
instagram_archiver/__init__.py,sha256=406v5g-tu6QhUfdMTNgI9rYbPYm7WfjiIL68HOXLXQY,270
|
|
2
|
-
instagram_archiver/__main__.py,sha256=oQ6s6zvZTBiEOgt-qep3bDY9ayxSanQr7KHzr6ENK0o,115
|
|
3
|
-
instagram_archiver/client.py,sha256=uMYembbrYWKJdXlBpuYsSK-p7tuj0qG3WMVV8M785NE,11326
|
|
4
|
-
instagram_archiver/constants.py,sha256=NJ8QlQZviY3dwwrIONThK_G9VcvAzOQM6Yg-hSyaj9A,1459
|
|
5
|
-
instagram_archiver/main.py,sha256=lW8rHPjQpqNH8TqjitW3DMILazn09sSWxJhVlkUC5Ck,3808
|
|
6
|
-
instagram_archiver/profile_scraper.py,sha256=BGnJZD3rF2e4aHpqSvG7UBoAiDjTyrpcYKSzsDgHO3M,7988
|
|
7
|
-
instagram_archiver/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
instagram_archiver/saved_scraper.py,sha256=PHSXkmK-MC-42Z4GZzNCXa8dSD682NEav1rms7mL-yk,2722
|
|
9
|
-
instagram_archiver/typing.py,sha256=J8TcQftpIY-IVsbx007e1WbA-XdLUhBuEWWAisFlpHA,4306
|
|
10
|
-
instagram_archiver/utils.py,sha256=l6f0W_brZhVPOjlKwoFYYum7ICyHJXpboTU7ANIQSPI,3842
|
|
11
|
-
instagram_archiver-0.3.0.dist-info/LICENSE.txt,sha256=cDLmbhzFwEUz5FL_OnA6Jp9zdz80330J6YyEq-00yNQ,1093
|
|
12
|
-
instagram_archiver-0.3.0.dist-info/METADATA,sha256=Vq8yWfCHOzOJduL9E0PEcIGz9qu900uiI96c1g0q3-I,5925
|
|
13
|
-
instagram_archiver-0.3.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
14
|
-
instagram_archiver-0.3.0.dist-info/entry_points.txt,sha256=kNXd0Sy6896DEBRcx2mVYiaE-OR9-XR56MpWuaNa49g,128
|
|
15
|
-
instagram_archiver-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|