instagram-archiver 0.0.5__tar.gz

@@ -0,0 +1,29 @@
+ Metadata-Version: 2.1
+ Name: instagram-archiver
+ Version: 0.0.5
+ Summary: Archive Instagram content.
+ Home-page: https://github.com/Tatsh/instgram-archiver
+ Author: Andrew Udvare
+ Author-email: audvare@gmail.com
+ License: MIT
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Provides-Extra: dev
+
+ # Instagram Archiver
+
+ A tool to download data from an Instagram profile you have access to. It downloads images, videos, and related metadata (stored as JSON files).
+
+ ## Installation
+
+ ```shell
+ pip install instagram-archiver
+ ```
+
+ ## Usage
+
+ ```shell
+ ia USERNAME
+ ```
+
+ If you would like to see output, pass `--debug`.
@@ -0,0 +1,17 @@
+ # Instagram Archiver
+
+ A tool to download data from an Instagram profile you have access to. It downloads images, videos, and related metadata (stored as JSON files).
+
+ ## Installation
+
+ ```shell
+ pip install instagram-archiver
+ ```
+
+ ## Usage
+
+ ```shell
+ ia USERNAME
+ ```
+
+ If you would like to see output, pass `--debug`.
@@ -0,0 +1,3 @@
+ from .main import main
+
+ __all__ = ('main',)
@@ -0,0 +1,33 @@
+ from typing import Final, Mapping
+
+ __all__ = ('SHARED_HEADERS', 'USER_AGENT')
+
+ USER_AGENT: Final[str] = ('Mozilla/5.0 (X11; Linux x86_64) '
+                           'AppleWebKit/537.36 (KHTML, like Gecko) '
+                           'Chrome/112.0.0.0 Safari/537.36')
+ SHARED_HEADERS: Final[Mapping[str, str]] = {
+     'accept':
+         ('text/html,application/xhtml+xml,application/xml;q=0.9,image/jxl,'
+          'image/avif,image/webp,image/apng,*/*;q=0.8,'
+          'application/signed-exchange;v=b3;q=0.9'),
+     'accept-language':
+         'en,en-GB;q=0.9,en-US;q=0.8',
+     'authority':
+         'www.instagram.com',
+     'cache-control':
+         'no-cache',
+     'dnt':
+         '1',
+     'pragma':
+         'no-cache',
+     'referer':
+         'https://www.instagram.com',
+     'upgrade-insecure-requests':
+         '1',
+     'user-agent':
+         USER_AGENT,
+     'viewport-width':
+         '2560',
+     'x-ig-app-id':
+         '936619743392459'
+ }
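These headers are consumed by `main.py`, which merges them into a `requests.Session` together with browser cookies. A minimal sketch of that pattern, assuming only that the package is installed (a real run also needs Instagram cookies and a CSRF token, as `main.py` shows):

```python
import requests

from instagram_archiver.constants import SHARED_HEADERS

# Attach the shared browser-like headers to a session so subsequent
# requests resemble an ordinary Chrome page load.
session = requests.Session()
session.headers.update(SHARED_HEADERS)

# Illustrative request only; without cookies Instagram will usually
# redirect to the login page.
response = session.get('https://www.instagram.com/', timeout=30)
response.raise_for_status()
```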
@@ -0,0 +1,26 @@
+ from typing import Any, Sequence, TypedDict
+
+
+ class MediaInfoItemVideoVersion(TypedDict):
+     height: int
+     url: str
+     width: int
+
+
+ class MediaInfoItemImageVersions2Candidate(TypedDict):
+     height: int
+     url: str
+     width: int
+
+
+ class MediaInfoItemImageVersions2(TypedDict):
+     candidates: Sequence[MediaInfoItemImageVersions2Candidate]
+
+
+ class MediaInfoItem(TypedDict):
+     image_versions2: MediaInfoItemImageVersions2
+     taken_at: int
+     user: Any
+     video_dash_manifest: str
+     video_duration: float
+     video_versions: Sequence[MediaInfoItemVideoVersion]
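These TypedDicts describe the shape of the media-info payload that `main.py` parses. A hedged sketch of how they could be used to pick the largest image candidate, mirroring the width-times-height ordering in `save_stuff` (the helper name `best_image_url` is illustrative, not part of the package):

```python
from typing import Sequence

from instagram_archiver.ig_typing import (MediaInfoItem,
                                          MediaInfoItemImageVersions2Candidate)


def best_image_url(item: MediaInfoItem) -> str:
    """Return the URL of the highest-resolution image candidate."""
    candidates: Sequence[MediaInfoItemImageVersions2Candidate] = (
        item['image_versions2']['candidates'])
    # Same ordering main.py uses: largest width * height wins.
    best = max(candidates, key=lambda c: c['width'] * c['height'])
    return best['url']
```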
@@ -0,0 +1,175 @@
+ from os import chdir, makedirs
+ from os.path import isfile
+ from pathlib import Path
+ from typing import Any
+ import json
+ import re
+ import sys
+
+ from loguru import logger
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+ from yt_dlp.cookies import extract_cookies_from_browser
+ import click
+ import requests
+ import yt_dlp
+
+ from .constants import SHARED_HEADERS
+ from .utils import (YoutubeDLLogger, get_extension, setup_logging,
+                     write_if_new)
+
+
+ def highlights_tray(session: requests.Session, user_id: int | str) -> Any:
+     with session.get(f'https://i.instagram.com/api/v1/highlights/{user_id}/'
+                      'highlights_tray/') as r:
+         r.raise_for_status()
+         return r.json()
+
+
+ @click.command()
+ @click.option('-o',
+               '--output-dir',
+               default=None,
+               help='Output directory',
+               type=click.Path(exists=True))
+ @click.option('-b',
+               '--browser',
+               default='chrome',
+               help='Browser to read cookies from')
+ @click.option('-p', '--profile', default='Default', help='Browser profile')
+ @click.option('-d', '--debug', is_flag=True, help='Enable debug output')
+ @click.argument('username')
+ def main(output_dir: Path | str | None,
+          browser: str,
+          profile: str,
+          username: str,
+          debug: bool = False) -> None:
+     setup_logging(debug)
+     if output_dir is None:
+         output_dir = Path('.', username)
+     makedirs(output_dir, exist_ok=True)
+     chdir(output_dir)
+     with requests.Session() as session:
+         session.mount(
+             'https://',
+             HTTPAdapter(max_retries=Retry(backoff_factor=2.5,
+                                           status_forcelist=(
+                                               429,
+                                               500,
+                                               502,
+                                               503,
+                                               504,
+                                           ))))
+         session.headers.update({
+             **SHARED_HEADERS,
+             **dict(cookie='; '.join(f'{cookie.name}={cookie.value}'
+                    for cookie in extract_cookies_from_browser(browser, profile)
+                    if 'instagram.com' in cookie.domain))
+         })
+         r = session.get('https://www.instagram.com')
+         r.raise_for_status()
+         r = session.get(f'https://www.instagram.com/{username}/')
+         r.raise_for_status()
+         m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
+         assert m is not None
+         session.headers.update({'x-csrftoken': m.group(1)})
+         r = session.get(
+             'https://i.instagram.com/api/v1/users/web_profile_info/',
+             params={'username': username})
+         r.raise_for_status()
+         with open('web_profile_info.json', 'wb') as f:
+             f.write(r.content)
+         user_info = r.json()['data']['user']
+         r = session.get(user_info['profile_pic_url_hd'])
+         r.raise_for_status()
+         with open('profile_pic.jpg', 'wb') as f:
+             f.write(r.content)
+         video_urls = []
+
+         # for item in highlights_tray(session, user_info['id'])['tray']:
+         #     video_urls.append('https://www.instagram.com/stories/highlights/'
+         #                       f'{item["id"].split(":")[-1]}/')
+         # sys.argv = [sys.argv[0]]
+         # ydl_opts = yt_dlp.parse_options()[-1]
+         # with yt_dlp.YoutubeDL({
+         #         **ydl_opts,
+         #         **dict(http_headers=SHARED_HEADERS,
+         #                logger=YoutubeDLLogger(),
+         #                verbose=debug)
+         # }) as ydl:
+         #     for url in video_urls:
+         #         ydl.extract_info(url)
+
+         def save_stuff(edges: Any) -> None:
+             nonlocal video_urls
+             for edge in edges:
+                 shortcode = edge['node']['shortcode']
+                 if edge['node']['__typename'] == 'GraphVideo':
+                     video_urls.append(
+                         f'https://www.instagram.com/p/{shortcode}')
+                 elif edge['node']['__typename'] == 'GraphImage':
+                     r = session.head(edge['node']['display_url'])
+                     r.raise_for_status()
+                     ext = get_extension(r.headers['content-type'])
+                     name = f'{edge["node"]["id"]}.{ext}'
+                     if not isfile(name):
+                         r = session.get(edge['node']['display_url'])
+                         r.raise_for_status()
+                         write_if_new(name, r.content, 'wb')
+                     write_if_new(f'{edge["node"]["id"]}.json',
+                                  json.dumps(edge['node']))
+                 elif edge['node']['__typename'] == 'GraphSidecar':
+                     r = session.get('https://i.instagram.com/api/v1/media/'
+                                     f'{edge["node"]["id"]}/info/')
+                     r.raise_for_status()
+                     item = r.json()['items'][0]
+                     write_if_new(f'{edge["node"]["id"]}.json',
+                                  json.dumps(item))
+                     for item in item['carousel_media']:
+                         best = sorted(item['image_versions2']['candidates'],
+                                       key=lambda x: x['width'] * x['height'],
+                                       reverse=True)[0]
+                         r = session.head(best['url'])
+                         r.raise_for_status()
+                         ext = get_extension(r.headers['content-type'])
+                         name = f'{item["id"]}.{ext}'
+                         if not isfile(name):
+                             r = session.get(best['url'])
+                             r.raise_for_status()
+                             write_if_new(name, r.content, 'wb')
+
+         save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
+         page_info = user_info['edge_owner_to_timeline_media']['page_info']
+         while page_info['has_next_page']:
+             params = dict(query_hash='69cba40317214236af40e7efa697781d',
+                           variables=json.dumps(
+                               dict(id=user_info['id'],
+                                    first=12,
+                                    after=page_info['end_cursor'])))
+             r = session.get('https://www.instagram.com/graphql/query/',
+                             params=params)
+             r.raise_for_status()
+             media = r.json()['data']['user']['edge_owner_to_timeline_media']
+             page_info = media['page_info']
+             save_stuff(media['edges'])
+         sys.argv = [sys.argv[0]]
+         ydl_opts = yt_dlp.parse_options()[-1]
+         if len(video_urls) > 0:
+             with yt_dlp.YoutubeDL({
+                     **ydl_opts,
+                     **dict(http_headers=SHARED_HEADERS,
+                            logger=YoutubeDLLogger(),
+                            verbose=debug)
+             }) as ydl:
+                 failed_urls = []
+                 for url in video_urls:
+                     if (not ydl.in_download_archive(
+                             dict(id=url.split('/')[-1],
+                                  extractor_key='instagram'))
+                             and not ydl.extract_info(url, ie_key='Instagram')):
+                         failed_urls.append(url)
+                 if len(failed_urls) > 0:
+                     logger.error('Some video URIs failed. Check failed.txt.')
+                     with open('failed.txt', 'w') as f:
+                         for url in failed_urls:
+                             f.write(f'{url}\n')
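Because `main` is a Click command, it can also be driven from Python instead of the `ia` console script. A small sketch, assuming the package is installed and cookies for the target account exist in the chosen browser profile (the username below is a placeholder):

```python
from instagram_archiver import main

# Roughly equivalent to `ia --browser chrome --debug some_username` on the CLI.
# standalone_mode=False makes Click return instead of calling sys.exit().
main(['--browser', 'chrome', '--debug', 'some_username'],
     standalone_mode=False)
```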
@@ -0,0 +1,85 @@
+ from os.path import isfile
+ from pathlib import Path
+ from types import FrameType
+ from typing import (Literal, Optional, Union)
+ import logging
+ import sys
+
+ from loguru import logger
+ import click
+
+ __all__ = ('UnknownMimetypeError', 'get_extension', 'write_if_new')
+
+
+ def write_if_new(target: Union[Path, str],
+                  content: Union[str, bytes],
+                  mode: str = 'w') -> None:
+     if not isfile(target):
+         with click.open_file(str(target), mode) as f:
+             f.write(content)
+
+
+ class UnknownMimetypeError(Exception):
+     pass
+
+
+ def get_extension(mimetype: str) -> Literal['png', 'jpg']:
+     if mimetype == 'image/jpeg':
+         return 'jpg'
+     if mimetype == 'image/png':
+         return 'png'
+     raise UnknownMimetypeError(mimetype)
+
+
+ class InterceptHandler(logging.Handler):  # pragma: no cover
+     """Intercept handler taken from Loguru's documentation."""
+     def emit(self, record: logging.LogRecord) -> None:
+         level: Union[str, int]
+         # Get the corresponding Loguru level if it exists
+         try:
+             level = logger.level(record.levelname).name
+         except ValueError:
+             level = record.levelno
+         # Find the caller from which the logged message originated
+         frame: Optional[FrameType] = logging.currentframe()
+         depth = 2
+         while frame and frame.f_code.co_filename == logging.__file__:
+             frame = frame.f_back
+             depth += 1
+         logger.opt(depth=depth, exception=record.exc_info).log(
+             level, record.getMessage())
+
+
+ def setup_log_intercept_handler() -> None:  # pragma: no cover
+     """Sets up Loguru to intercept records from the logging module."""
+     logging.basicConfig(handlers=(InterceptHandler(),), level=0)
+
+
+ def setup_logging(debug: Optional[bool] = False) -> None:
+     """Shared function to enable logging."""
+     if debug:  # pragma: no cover
+         setup_log_intercept_handler()
+         logger.enable('')
+     else:
+         logger.configure(handlers=(dict(
+             format='<level>{message}</level>',
+             level='INFO',
+             sink=sys.stderr,
+         ),))
+
+
+ class YoutubeDLLogger:
+     def debug(self, message: str) -> None:
+         if message.startswith('[debug] '):
+             logger.debug(message)
+         else:
+             logger.info(message)
+
+     def info(self, _: str) -> None:
+         pass
+
+     def warning(self, message: str) -> None:
+         logger.warning(message)
+
+     def error(self, message: str) -> None:
+         logger.error(message)
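A short example of how these helpers combine, following the same HEAD-then-download pattern used in `main.py`; the URL and filename are placeholders, and `requests` is used directly here only for illustration:

```python
import requests

from instagram_archiver.utils import get_extension, setup_logging, write_if_new

setup_logging(debug=True)

url = 'https://example.com/some-image'  # placeholder URL
head = requests.head(url, timeout=30)
head.raise_for_status()

# Map the content type to an extension; raises UnknownMimetypeError
# for anything other than image/jpeg or image/png.
ext = get_extension(head.headers['content-type'])

body = requests.get(url, timeout=30)
body.raise_for_status()
# Only writes the file if it does not already exist.
write_if_new(f'example.{ext}', body.content, 'wb')
```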
@@ -0,0 +1,29 @@
+ Metadata-Version: 2.1
+ Name: instagram-archiver
+ Version: 0.0.5
+ Summary: Archive Instagram content.
+ Home-page: https://github.com/Tatsh/instgram-archiver
+ Author: Andrew Udvare
+ Author-email: audvare@gmail.com
+ License: MIT
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ Provides-Extra: dev
+
+ # Instagram Archiver
+
+ A tool to download data from an Instagram profile you have access to. It downloads images, videos, and related metadata (stored as JSON files).
+
+ ## Installation
+
+ ```shell
+ pip install instagram-archiver
+ ```
+
+ ## Usage
+
+ ```shell
+ ia USERNAME
+ ```
+
+ If you would like to see output, pass `--debug`.
@@ -0,0 +1,13 @@
+ README.md
+ setup.py
+ instagram_archiver/__init__.py
+ instagram_archiver/constants.py
+ instagram_archiver/ig_typing.py
+ instagram_archiver/main.py
+ instagram_archiver/utils.py
+ instagram_archiver.egg-info/PKG-INFO
+ instagram_archiver.egg-info/SOURCES.txt
+ instagram_archiver.egg-info/dependency_links.txt
+ instagram_archiver.egg-info/entry_points.txt
+ instagram_archiver.egg-info/requires.txt
+ instagram_archiver.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+ [console_scripts]
+ ia = instagram_archiver:main
@@ -0,0 +1,12 @@
+ click>=8.0.0
+ loguru>=0.5.3
+ requests
+ yt-dlp>=2022.7.18
+
+ [dev]
+ mypy
+ mypy-extensions
+ pylint
+ pylint-quotes
+ rope
+ types-requests>=2.25.9
@@ -0,0 +1 @@
+ instagram_archiver
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,24 @@
+ from setuptools import find_packages, setup
+
+ with open('README.md') as f:
+     setup(author='Andrew Udvare',
+           author_email='audvare@gmail.com',
+           description='Archive Instagram content.',
+           entry_points={'console_scripts': ['ia = instagram_archiver:main']},
+           extras_require={
+               'dev': [
+                   'mypy', 'mypy-extensions', 'pylint', 'pylint-quotes', 'rope',
+                   'types-requests>=2.25.9'
+               ]
+           },
+           install_requires=[
+               'click>=8.0.0', 'loguru>=0.5.3', 'requests', 'yt-dlp>=2022.7.18'
+           ],
+           license='MIT',
+           long_description=f.read(),
+           long_description_content_type='text/markdown',
+           name='instagram-archiver',
+           packages=find_packages(),
+           python_requires='>=3.9',
+           url='https://github.com/Tatsh/instgram-archiver',
+           version='0.0.5')