instagram-archiver 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instagram-archiver-0.0.5/PKG-INFO +29 -0
- instagram-archiver-0.0.5/README.md +17 -0
- instagram-archiver-0.0.5/instagram_archiver/__init__.py +3 -0
- instagram-archiver-0.0.5/instagram_archiver/constants.py +33 -0
- instagram-archiver-0.0.5/instagram_archiver/ig_typing.py +26 -0
- instagram-archiver-0.0.5/instagram_archiver/main.py +175 -0
- instagram-archiver-0.0.5/instagram_archiver/utils.py +85 -0
- instagram-archiver-0.0.5/instagram_archiver.egg-info/PKG-INFO +29 -0
- instagram-archiver-0.0.5/instagram_archiver.egg-info/SOURCES.txt +13 -0
- instagram-archiver-0.0.5/instagram_archiver.egg-info/dependency_links.txt +1 -0
- instagram-archiver-0.0.5/instagram_archiver.egg-info/entry_points.txt +2 -0
- instagram-archiver-0.0.5/instagram_archiver.egg-info/requires.txt +12 -0
- instagram-archiver-0.0.5/instagram_archiver.egg-info/top_level.txt +1 -0
- instagram-archiver-0.0.5/setup.cfg +4 -0
- instagram-archiver-0.0.5/setup.py +24 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: instagram-archiver
|
|
3
|
+
Version: 0.0.5
|
|
4
|
+
Summary: Archive Instagram content.
|
|
5
|
+
Home-page: https://github.com/Tatsh/instgram-archiver
|
|
6
|
+
Author: Andrew Udvare
|
|
7
|
+
Author-email: audvare@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
|
|
13
|
+
# Instagram Archiver
|
|
14
|
+
|
|
15
|
+
Tool to download data from an Instagram profile you have access to. It downloads the images, videos, and related metadata (stored as JSON files).
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```shell
|
|
20
|
+
pip install instagram-archiver
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```shell
|
|
26
|
+
ia USERNAME
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
If you would like to see output, pass `--debug`.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Instagram Archiver
|
|
2
|
+
|
|
3
|
+
Tool to download data from an Instagram profile you have access to. It downloads the images, videos, and related metadata (stored as JSON files).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```shell
|
|
8
|
+
pip install instagram-archiver
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```shell
|
|
14
|
+
ia USERNAME
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
If you would like to see output, pass `--debug`.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from typing import Final, Mapping

__all__ = ('SHARED_HEADERS', 'USER_AGENT')

# Desktop Chrome user-agent string sent with every request.
USER_AGENT: Final[str] = (
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/112.0.0.0 Safari/537.36')

# Browser-like headers shared by the HTTP session and yt-dlp downloads so
# requests blend in with normal Instagram web traffic.
SHARED_HEADERS: Final[Mapping[str, str]] = {
    'accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/jxl,image/avif,image/webp,image/apng,*/*;q=0.8,'
               'application/signed-exchange;v=b3;q=0.9'),
    'accept-language': 'en,en-GB;q=0.9,en-US;q=0.8',
    'authority': 'www.instagram.com',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'referer': 'https://www.instagram.com',
    'upgrade-insecure-requests': '1',
    'user-agent': USER_AGENT,
    'viewport-width': '2560',
    # App ID used by the instagram.com web client for i.instagram.com APIs.
    'x-ig-app-id': '936619743392459',
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Any, Sequence, TypedDict


class MediaInfoItemVideoVersion(TypedDict):
    """One video rendition: a single resolution/URL pair."""
    height: int
    url: str
    width: int


class MediaInfoItemImageVersions2Candidate(TypedDict):
    """One candidate image rendition listed under ``image_versions2``."""
    height: int
    url: str
    width: int


class MediaInfoItemImageVersions2(TypedDict):
    """Container for the candidate image renditions of a media item."""
    candidates: Sequence[MediaInfoItemImageVersions2Candidate]


class MediaInfoItem(TypedDict):
    """Subset of an item from Instagram's ``media/.../info/`` API payload.

    Only the keys this project reads are declared; ``user`` is left as
    ``Any`` because its schema is not used here.
    """
    image_versions2: MediaInfoItemImageVersions2
    taken_at: int
    user: Any
    video_dash_manifest: str
    video_duration: float
    video_versions: Sequence[MediaInfoItemVideoVersion]
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from os import chdir, makedirs
|
|
2
|
+
from os.path import isfile
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from requests.adapters import HTTPAdapter
|
|
11
|
+
from urllib3.util.retry import Retry
|
|
12
|
+
from yt_dlp.cookies import extract_cookies_from_browser
|
|
13
|
+
import click
|
|
14
|
+
import requests
|
|
15
|
+
import yt_dlp
|
|
16
|
+
|
|
17
|
+
from .constants import SHARED_HEADERS
|
|
18
|
+
from .utils import (YoutubeDLLogger, get_extension, setup_logging,
|
|
19
|
+
write_if_new)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def highlights_tray(session: requests.Session, user_id: int | str) -> Any:
    """Return the decoded JSON highlights tray for an Instagram user ID."""
    tray_url = ('https://i.instagram.com/api/v1/highlights/'
                f'{user_id}/highlights_tray/')
    with session.get(tray_url) as response:
        response.raise_for_status()
        return response.json()
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@click.command()
@click.option('-o',
              '--output-dir',
              default=None,
              help='Output directory',
              type=click.Path(exists=True))
@click.option('-b',
              '--browser',
              default='chrome',
              help='Browser to read cookies from')
@click.option('-p', '--profile', default='Default', help='Browser profile')
@click.option('-d', '--debug', is_flag=True, help='Enable debug output')
@click.argument('username')
def main(output_dir: Path | str | None,
         browser: str,
         profile: str,
         username: str,
         debug: bool = False) -> None:
    """Archive images, videos and metadata from an Instagram profile.

    Authentication cookies are read from ``browser``/``profile``, so the
    session acts as the currently logged-in browser user. Files are written
    under ``output_dir`` (defaults to ``./USERNAME``).
    """
    setup_logging(debug)
    if output_dir is None:
        output_dir = Path('.', username)
    makedirs(output_dir, exist_ok=True)
    chdir(output_dir)
    with requests.Session() as session:
        # Retry transient failures (rate limiting and 5xx) with backoff.
        session.mount(
            'https://',
            HTTPAdapter(max_retries=Retry(backoff_factor=2.5,
                                          status_forcelist=(
                                              429,
                                              500,
                                              502,
                                              503,
                                              504,
                                          ))))
        session.headers.update({
            **SHARED_HEADERS,
            'cookie': '; '.join(
                f'{cookie.name}={cookie.value}'
                for cookie in extract_cookies_from_browser(browser, profile)
                if 'instagram.com' in cookie.domain)
        })
        # Warm up the session and scrape the CSRF token from the profile
        # page; the API endpoints below reject requests without it.
        r = session.get('https://www.instagram.com')
        r.raise_for_status()
        r = session.get(f'https://www.instagram.com/{username}/')
        r.raise_for_status()
        m = re.search(r'"config":{"csrf_token":"([^"]+)"', r.text)
        assert m is not None
        session.headers.update({'x-csrftoken': m.group(1)})
        r = session.get(
            'https://i.instagram.com/api/v1/users/web_profile_info/',
            params={'username': username})
        r.raise_for_status()
        with open('web_profile_info.json', 'wb') as f:
            f.write(r.content)
        user_info = r.json()['data']['user']
        r = session.get(user_info['profile_pic_url_hd'])
        r.raise_for_status()
        with open('profile_pic.jpg', 'wb') as f:
            f.write(r.content)
        # URLs queued here are handed to yt-dlp after pagination finishes.
        video_urls: list[str] = []

        # TODO Archive story highlights as well (see highlights_tray()).

        def save_stuff(edges: Any) -> None:
            """Save one page of timeline edges (queue videos, fetch images)."""
            for edge in edges:
                node = edge['node']
                typename = node['__typename']
                if typename == 'GraphVideo':
                    video_urls.append(
                        f'https://www.instagram.com/p/{node["shortcode"]}')
                elif typename == 'GraphImage':
                    # HEAD first to learn the content type (and extension)
                    # without downloading the image twice.
                    r = session.head(node['display_url'])
                    r.raise_for_status()
                    ext = get_extension(r.headers['content-type'])
                    name = f'{node["id"]}.{ext}'
                    if not isfile(name):
                        r = session.get(node['display_url'])
                        r.raise_for_status()
                        write_if_new(name, r.content, 'wb')
                    write_if_new(f'{node["id"]}.json', json.dumps(node))
                elif typename == 'GraphSidecar':
                    r = session.get('https://i.instagram.com/api/v1/media/'
                                    f'{node["id"]}/info/')
                    # Check the HTTP status *before* decoding the body;
                    # previously an error response raised a confusing JSON
                    # decode error instead of an HTTPError.
                    r.raise_for_status()
                    item = r.json()['items'][0]
                    write_if_new(f'{node["id"]}.json', json.dumps(item))
                    for sub_item in item['carousel_media']:
                        # Pick the highest-resolution image candidate.
                        best = max(sub_item['image_versions2']['candidates'],
                                   key=lambda x: x['width'] * x['height'])
                        r = session.head(best['url'])
                        r.raise_for_status()
                        ext = get_extension(r.headers['content-type'])
                        name = f'{sub_item["id"]}.{ext}'
                        if not isfile(name):
                            r = session.get(best['url'])
                            r.raise_for_status()
                            write_if_new(name, r.content, 'wb')

        save_stuff(user_info['edge_owner_to_timeline_media']['edges'])
        page_info = user_info['edge_owner_to_timeline_media']['page_info']
        while page_info['has_next_page']:
            # query_hash identifies the "timeline media" GraphQL query.
            params = dict(query_hash='69cba40317214236af40e7efa697781d',
                          variables=json.dumps(
                              dict(id=user_info['id'],
                                   first=12,
                                   after=page_info['end_cursor'])))
            r = session.get('https://www.instagram.com/graphql/query/',
                            params=params)
            r.raise_for_status()
            media = r.json()['data']['user']['edge_owner_to_timeline_media']
            page_info = media['page_info']
            save_stuff(media['edges'])
        # Clear argv so yt_dlp.parse_options() yields default options only.
        sys.argv = [sys.argv[0]]
        ydl_opts = yt_dlp.parse_options()[-1]
        if video_urls:
            with yt_dlp.YoutubeDL({
                    **ydl_opts,
                    **dict(http_headers=SHARED_HEADERS,
                           logger=YoutubeDLLogger(),
                           verbose=debug)
            }) as ydl:
                failed_urls = []
                for url in video_urls:
                    # Skip anything already in the download archive; an
                    # empty/false extract_info() result counts as a failure.
                    if (not ydl.in_download_archive(
                            dict(id=url.split('/')[-1],
                                 extractor_key='instagram'))
                            and not ydl.extract_info(url, ie_key='Instagram')):
                        failed_urls.append(url)
                if failed_urls:
                    logger.error('Some video URIs failed. Check failed.txt.')
                    with open('failed.txt', 'w') as f:
                        for url in failed_urls:
                            f.write(f'{url}\n')
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from os.path import isfile
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from types import FrameType
|
|
4
|
+
from typing import (Literal, Optional, Union)
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from loguru import logger
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
__all__ = ('UnknownMimetypeError', 'get_extension', 'write_if_new')
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def write_if_new(target: Union[Path, str],
                 content: Union[str, bytes],
                 mode: str = 'w') -> None:
    """Write ``content`` to ``target`` only if the file does not exist yet."""
    if isfile(target):
        return
    with click.open_file(str(target), mode) as handle:
        handle.write(content)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class UnknownMimetypeError(Exception):
    """Raised when a MIME type cannot be mapped to a file extension."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_extension(mimetype: str) -> Literal['png', 'jpg']:
    """Map an image MIME type to its file extension.

    Raises ``UnknownMimetypeError`` for anything other than JPEG or PNG.
    """
    extensions: dict[str, Literal['png', 'jpg']] = {
        'image/jpeg': 'jpg',
        'image/png': 'png',
    }
    if mimetype in extensions:
        return extensions[mimetype]
    raise UnknownMimetypeError(mimetype)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class InterceptHandler(logging.Handler):  # pragma: no cover
    """Intercept handler taken from Loguru's documentation.

    Forwards every stdlib ``logging`` record to the Loguru logger so that
    third-party library output is rendered by Loguru as well.
    """
    def emit(self, record: logging.LogRecord) -> None:
        level: Union[str, int]
        # Get corresponding Loguru level if it exists; fall back to the
        # numeric stdlib level for custom levels Loguru does not know.
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno
        # Find caller from where originated the logged message by walking
        # past the logging module's own frames, so Loguru reports the real
        # call site.
        frame: Optional[FrameType] = logging.currentframe()
        depth = 2
        while frame and frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1
        logger.opt(depth=depth, exception=record.exc_info).log(
            level, record.getMessage())
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def setup_log_intercept_handler() -> None:  # pragma: no cover
    """Route all stdlib ``logging`` records through Loguru."""
    # NOTSET (0) lets every record through to the intercept handler.
    logging.basicConfig(handlers=(InterceptHandler(),), level=logging.NOTSET)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def setup_logging(debug: Optional[bool] = False) -> None:
    """Shared function to enable logging.

    With ``debug`` set, stdlib logging is intercepted and all Loguru output
    is enabled; otherwise a plain INFO-level stderr sink is configured.
    """
    if not debug:
        logger.configure(handlers=({
            'format': '<level>{message}</level>',
            'level': 'INFO',
            'sink': sys.stderr,
        },))
        return
    setup_log_intercept_handler()  # pragma: no cover
    logger.enable('')  # pragma: no cover
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class YoutubeDLLogger:
    """Adapter that forwards yt-dlp log messages to Loguru."""
    def debug(self, message: str) -> None:
        # yt-dlp sends both debug and info messages to debug(); the
        # '[debug] ' prefix is the documented way to tell them apart.
        sink = logger.debug if message.startswith('[debug] ') else logger.info
        sink(message)

    def info(self, _: str) -> None:
        """Ignored; yt-dlp routes informational output via ``debug()``."""

    def warning(self, message: str) -> None:
        logger.warning(message)

    def error(self, message: str) -> None:
        logger.error(message)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: instagram-archiver
|
|
3
|
+
Version: 0.0.5
|
|
4
|
+
Summary: Archive Instagram content.
|
|
5
|
+
Home-page: https://github.com/Tatsh/instgram-archiver
|
|
6
|
+
Author: Andrew Udvare
|
|
7
|
+
Author-email: audvare@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
|
|
13
|
+
# Instagram Archiver
|
|
14
|
+
|
|
15
|
+
Tool to download data from an Instagram profile you have access to. It downloads the images, videos, and related metadata (stored as JSON files).
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```shell
|
|
20
|
+
pip install instagram-archiver
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```shell
|
|
26
|
+
ia USERNAME
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
If you would like to see output, pass `--debug`.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
instagram_archiver/__init__.py
|
|
4
|
+
instagram_archiver/constants.py
|
|
5
|
+
instagram_archiver/ig_typing.py
|
|
6
|
+
instagram_archiver/main.py
|
|
7
|
+
instagram_archiver/utils.py
|
|
8
|
+
instagram_archiver.egg-info/PKG-INFO
|
|
9
|
+
instagram_archiver.egg-info/SOURCES.txt
|
|
10
|
+
instagram_archiver.egg-info/dependency_links.txt
|
|
11
|
+
instagram_archiver.egg-info/entry_points.txt
|
|
12
|
+
instagram_archiver.egg-info/requires.txt
|
|
13
|
+
instagram_archiver.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
instagram_archiver
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from setuptools import find_packages, setup

# Read the README up front; it becomes the long description on PyPI.
with open('README.md') as f:
    readme_contents = f.read()

setup(author='Andrew Udvare',
      author_email='audvare@gmail.com',
      description='Archive Instagram content.',
      entry_points={'console_scripts': ['ia = instagram_archiver:main']},
      extras_require={
          'dev': [
              'mypy', 'mypy-extensions', 'pylint', 'pylint-quotes', 'rope',
              'types-requests>=2.25.9'
          ]
      },
      install_requires=[
          'click>=8.0.0', 'loguru>=0.5.3', 'requests', 'yt-dlp>=2022.7.18'
      ],
      license='MIT',
      long_description=readme_contents,
      long_description_content_type='text/markdown',
      name='instagram-archiver',
      packages=find_packages(),
      python_requires='>=3.9',
      url='https://github.com/Tatsh/instgram-archiver',
      version='0.0.5')
|