instagram-archiver 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package. The information is provided for informational purposes only and reflects the package contents as published to the public registry.

Potentially problematic release.


This version of instagram-archiver might be problematic; see the registry's advisory page for more details.

@@ -1,103 +1,63 @@
1
- from typing import TYPE_CHECKING, Final, Mapping
1
+ """Constants."""
2
+ from __future__ import annotations
2
3
 
3
- from .utils import YoutubeDLLogger
4
+ __all__ = ('API_HEADERS', 'BROWSER_CHOICES', 'PAGE_FETCH_HEADERS', 'SHARED_HEADERS', 'USER_AGENT')
4
5
 
5
- __all__ = ('LOG_SCHEMA', 'RETRY_ABORT_NUM', 'SHARED_HEADERS', 'SHARED_YT_DLP_OPTIONS',
6
- 'YT_DLP_SLEEP_INTERVAL', 'USER_AGENT')
6
+ USER_AGENT = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
7
+ 'Chrome/137.0.0.0 Safari/537.36')
8
+ """
9
+ User agent.
7
10
 
8
- if TYPE_CHECKING:
9
- from yt_dlp import YoutubeDLOptions
10
-
11
- USER_AGENT: Final[str] = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
12
- 'Chrome/112.0.0.0 Safari/537.36')
13
- # Do not set the x-ig-d header as this will cause API calls to return 404
14
- SHARED_HEADERS: Final[Mapping[str, str]] = {
15
- 'accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,image/jxl,'
16
- 'image/avif,image/webp,image/apng,*/*;q=0.8,'
17
- 'application/signed-exchange;v=b3;q=0.9'),
18
- 'accept-language': 'en,en-GB;q=0.9,en-US;q=0.8',
11
+ :meta hide-value:
12
+ """
13
+ SHARED_HEADERS = {
14
+ 'accept': '*/*',
19
15
  'authority': 'www.instagram.com',
20
16
  'cache-control': 'no-cache',
21
17
  'dnt': '1',
22
18
  'pragma': 'no-cache',
23
- 'referer': 'https://www.instagram.com',
24
- 'upgrade-insecure-requests': '1',
25
19
  'user-agent': USER_AGENT,
26
- 'viewport-width': '2560',
27
- 'x-ig-app-id': '936619743392459'
20
+ # 'x-asbd-id': '359341',
21
+ # 'x-ig-app-id': '936619743392459',
22
+ }
23
+ """
24
+ Headers to use for requests.
25
+
26
+ :meta hide-value:
27
+ """
28
+ API_HEADERS = {
29
+ 'x-asbd-id': '359341',
30
+ 'x-ig-app-id': '936619743392459',
31
+ }
32
+ """
33
+ Headers to use for API requests.
34
+
35
+ :meta hide-value:
36
+ """
37
+ PAGE_FETCH_HEADERS = {
38
+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
39
+ 'image/apng,*/*;q=0.8',
40
+ 'dpr': '1.5',
41
+ 'sec-fetch-mode': 'navigate', # Definitely required.
42
+ 'viewport-width': '3840',
28
43
  }
29
- LOG_SCHEMA: Final[str] = '''
30
- CREATE TABLE log (
44
+ """
45
+ Headers to use for fetching HTML pages.
46
+
47
+ :meta hide-value:
48
+ """
49
+ LOG_SCHEMA = """CREATE TABLE log (
31
50
  url TEXT PRIMARY KEY NOT NULL,
32
51
  date TEXT DEFAULT CURRENT_TIMESTAMP NOT NULL
33
- );
34
- '''
35
- CALLS_PER_MINUTE: Final[int] = 10
36
- YT_DLP_SLEEP_INTERVAL: Final[float] = 60 / CALLS_PER_MINUTE
37
- #: Value taken from Instagram's JS under BootloaderConfig
38
- RETRY_ABORT_NUM: Final[int] = 2
39
- SHARED_YT_DLP_OPTIONS: 'YoutubeDLOptions' = {
40
- 'allowed_extractors': ['Instagram.*'],
41
- 'allsubtitles': True,
42
- 'cookiesfrombrowser': None,
43
- 'geo_bypass': True,
44
- 'getcomments': False,
45
- 'hls_use_mpegts': True,
46
- 'http_headers': SHARED_HEADERS,
47
- 'ignore_no_formats_error': True,
48
- 'ignoreerrors': True,
49
- 'logger': YoutubeDLLogger(),
50
- 'outtmpl': {
51
- 'default': '%(title).128s___src=%(extractor)s___id=%(id)s.%(ext)s',
52
- 'pl_thumbnail': ''
53
- },
54
- 'overwrites': False,
55
- 'max_sleep_interval': 6.0,
56
- 'merge_output_format': 'mkv',
57
- 'postprocessors': [{
58
- 'api': 'https://sponsor.ajay.app',
59
- 'categories': [
60
- 'preview', 'selfpromo', 'interaction', 'music_offtopic', 'sponsor', 'poi_highlight',
61
- 'intro', 'outro', 'filler', 'chapter'
62
- ],
63
- 'key': 'SponsorBlock',
64
- 'when': 'after_filter'
65
- }, {
66
- 'format': 'srt',
67
- 'key': 'FFmpegSubtitlesConvertor',
68
- 'when': 'before_dl'
69
- }, {
70
- 'already_have_subtitle': True,
71
- 'key': 'FFmpegEmbedSubtitle'
72
- }, {
73
- 'force_keyframes': False,
74
- 'key': 'ModifyChapters',
75
- 'remove_chapters_patterns': [],
76
- 'remove_ranges': [],
77
- 'remove_sponsor_segments': [],
78
- 'sponsorblock_chapter_title': '[SponsorBlock]: %(category_names)'
79
- }, {
80
- 'add_chapters': True,
81
- 'add_infojson': 'if_exists',
82
- 'add_metadata': True,
83
- 'key': 'FFmpegMetadata'
84
- }, {
85
- 'already_have_thumbnail': False,
86
- 'key': 'EmbedThumbnail'
87
- }, {
88
- 'key': 'FFmpegConcat',
89
- 'only_multi_video': True,
90
- 'when': 'playlist'
91
- }],
92
- 'restrictfilenames': True,
93
- 'skip_unavailable_fragments': True,
94
- 'sleep_interval': YT_DLP_SLEEP_INTERVAL,
95
- 'sleep_interval_requests': YT_DLP_SLEEP_INTERVAL,
96
- 'sleep_interval_subtitles': YT_DLP_SLEEP_INTERVAL,
97
- 'subtitleslangs': ['all'],
98
- 'verbose': False,
99
- 'writeautomaticsub': True,
100
- 'writesubtitles': True,
101
- 'writeinfojson': True,
102
- 'writethumbnail': True,
103
- }
52
+ );"""
53
+ """
54
+ Schema for log database.
55
+
56
+ :meta hide-value:
57
+ """
58
+ BROWSER_CHOICES = ('brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari')
59
+ """
60
+ Possible browser choices to get cookies from.
61
+
62
+ :meta hide-value:
63
+ """
@@ -1,66 +1,98 @@
1
- import sys
1
+ """Main application."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING
2
6
 
3
- from loguru import logger
4
- from requests.exceptions import RetryError
5
7
  import click
6
8
 
7
- from .client import AuthenticationError, Browser, InstagramClient
9
+ from .constants import BROWSER_CHOICES
10
+ from .profile_scraper import ProfileScraper
11
+ from .saved_scraper import SavedScraper
8
12
  from .utils import setup_logging
9
13
 
14
+ if TYPE_CHECKING:
15
+ from .typing import BrowserName
16
+
17
+ __all__ = ('main',)
10
18
 
11
- @click.command()
19
+
20
+ @click.command(context_settings={'help_option_names': ('-h', '--help')})
12
21
  @click.option('-o',
13
22
  '--output-dir',
14
- default=None,
15
- help='Output directory',
16
- type=click.Path(exists=True))
23
+ default='%(username)s',
24
+ help='Output directory.',
25
+ type=click.Path(file_okay=False, writable=True))
17
26
  @click.option('-b',
18
27
  '--browser',
19
28
  default='chrome',
20
- type=click.Choice(
21
- ['brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'firefox', 'safari']),
22
- help='Browser to read cookies from')
23
- @click.option('-p', '--profile', default='Default', help='Browser profile')
24
- @click.option('-d', '--debug', is_flag=True, help='Enable debug output')
25
- @click.option('--no-log', is_flag=True, help='Ignore log (re-fetch everything)')
29
+ type=click.Choice(BROWSER_CHOICES),
30
+ help='Browser to read cookies from.')
31
+ @click.option('-p', '--profile', default='Default', help='Browser profile.')
32
+ @click.option('-d', '--debug', is_flag=True, help='Enable debug output.')
33
+ @click.option('--no-log', is_flag=True, help='Ignore log (re-fetch everything).')
26
34
  @click.option('-C',
27
35
  '--include-comments',
28
36
  is_flag=True,
29
- help='Also download all comments (extends '
30
- 'download time significantly).')
37
+ help='Also download all comments (extends download time significantly).')
31
38
  @click.argument('username')
32
- def main(output_dir: str | None,
33
- browser: Browser,
34
- profile: str,
39
+ def main(output_dir: str,
35
40
  username: str,
41
+ browser: BrowserName = 'chrome',
42
+ profile: str = 'Default',
43
+ *,
36
44
  debug: bool = False,
37
- no_log: bool = False,
38
- include_comments: bool = False) -> None:
39
- setup_logging(debug)
45
+ include_comments: bool = False,
46
+ no_log: bool = False) -> None:
47
+ """Archive a profile's posts.""" # noqa: DOC501
48
+ setup_logging(debug=debug)
40
49
  try:
41
- with InstagramClient(username=username,
42
- output_dir=output_dir,
43
- browser_profile=profile,
44
- browser=browser,
45
- debug=debug,
46
- disable_log=no_log,
47
- comments=include_comments) as client:
50
+ with ProfileScraper(browser=browser,
51
+ browser_profile=profile,
52
+ comments=include_comments,
53
+ disable_log=no_log,
54
+ output_dir=(Path(output_dir % {'username': username})
55
+ if '%(username)s' in output_dir else Path(output_dir)),
56
+ username=username) as client:
48
57
  client.process()
49
- except RetryError as e:
50
- click.echo(
51
- 'Open your browser and login if necessary. If you are logged in and this continues, '
52
- 'try waiting at least 12 hours.',
53
- file=sys.stderr)
54
- raise click.Abort() from e
55
- except AuthenticationError as e:
56
- click.echo(
57
- 'You are probably not logged into Instagram in this browser profile or your '
58
- 'session has expired.',
59
- file=sys.stderr)
60
- raise click.Abort() from e
61
58
  except Exception as e:
62
- if debug:
63
- logger.exception(e)
64
- else:
65
- click.echo('Run with --debug for more information')
66
- raise click.Abort(f'{e} (run with --debug for more information)') from e
59
+ if isinstance(e, KeyboardInterrupt) or debug:
60
+ raise
61
+ click.echo('Run with --debug for more information.', err=True)
62
+ raise click.Abort from e
63
+
64
+
65
+ @click.command(context_settings={'help_option_names': ('-h', '--help')})
66
+ @click.option('-o',
67
+ '--output-dir',
68
+ default='.',
69
+ help='Output directory.',
70
+ type=click.Path(file_okay=False, writable=True))
71
+ @click.option('-b',
72
+ '--browser',
73
+ default='chrome',
74
+ type=click.Choice(BROWSER_CHOICES),
75
+ help='Browser to read cookies from.')
76
+ @click.option('-p', '--profile', default='Default', help='Browser profile.')
77
+ @click.option('-d', '--debug', is_flag=True, help='Enable debug output.')
78
+ @click.option('-C',
79
+ '--include-comments',
80
+ is_flag=True,
81
+ help='Also download all comments (extends download time significantly).')
82
+ @click.option('-u', '--unsave', is_flag=True, help='Unsave posts after successful archive.')
83
+ def save_saved_main(output_dir: str,
84
+ browser: BrowserName = 'chrome',
85
+ profile: str = 'Default',
86
+ *,
87
+ debug: bool = False,
88
+ include_comments: bool = False,
89
+ unsave: bool = False) -> None:
90
+ """Archive your saved posts.""" # noqa: DOC501
91
+ setup_logging(debug=debug)
92
+ try:
93
+ SavedScraper(browser, profile, output_dir, comments=include_comments).process(unsave=unsave)
94
+ except Exception as e:
95
+ if isinstance(e, KeyboardInterrupt) or debug:
96
+ raise
97
+ click.echo('Run with --debug for more information.', err=True)
98
+ raise click.Abort from e
@@ -0,0 +1,194 @@
1
+ """Instagram client."""
2
+ from __future__ import annotations
3
+
4
+ from contextlib import chdir
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, TypeVar, override
7
+ from urllib.parse import urlparse
8
+ import json
9
+ import logging
10
+ import sqlite3
11
+
12
+ from requests import HTTPError
13
+ from yt_dlp_utils import get_configured_yt_dlp
14
+
15
+ from .client import InstagramClient
16
+ from .constants import LOG_SCHEMA
17
+ from .typing import (
18
+ BrowserName,
19
+ WebProfileInfo,
20
+ XDTAPIV1FeedUserTimelineGraphQLConnectionContainer,
21
+ )
22
+ from .utils import SaveCommentsCheckDisabledMixin
23
+
24
+ if TYPE_CHECKING:
25
+ from types import TracebackType
26
+
27
+ __all__ = ('ProfileScraper',)
28
+
29
+ T = TypeVar('T')
30
+ log = logging.getLogger(__name__)
31
+
32
+
33
+ def _clean_url(url: str) -> str:
34
+ parsed = urlparse(url)
35
+ return f'https://{parsed.netloc}{parsed.path}'
36
+
37
+
38
+ class ProfileScraper(SaveCommentsCheckDisabledMixin, InstagramClient):
39
+ """The scraper."""
40
+ def __init__(self,
41
+ username: str,
42
+ *,
43
+ log_file: str | Path | None = None,
44
+ output_dir: str | Path | None = None,
45
+ disable_log: bool = False,
46
+ browser: BrowserName = 'chrome',
47
+ browser_profile: str = 'Default',
48
+ comments: bool = False) -> None:
49
+ """
50
+ Initialise ``ProfileScraper``.
51
+
52
+ Parameters
53
+ ----------
54
+ username : str
55
+ The username to scrape.
56
+ log_file : str | Path | None
57
+ The log file to use.
58
+ output_dir : str | Path | None
59
+ The output directory to save the posts to.
60
+ disable_log : bool
61
+ Whether to disable logging or not.
62
+ browser : BrowserName
63
+ The browser to use.
64
+ browser_profile : str
65
+ The browser profile to use.
66
+ comments : bool
67
+ Whether to save comments or not.
68
+ """
69
+ super().__init__(browser, browser_profile)
70
+ self._no_log = disable_log
71
+ self._output_dir = Path(output_dir or Path.cwd() / username)
72
+ self._output_dir.mkdir(parents=True, exist_ok=True)
73
+ self._log_db = Path(log_file or self._output_dir / '.log.db')
74
+ self._connection = sqlite3.connect(self._log_db)
75
+ self._cursor = self._connection.cursor()
76
+ self._setup_db()
77
+ self._username = username
78
+ self.should_save_comments = comments
79
+
80
+ def _setup_db(self) -> None:
81
+ if self._no_log:
82
+ return
83
+ existed = self._log_db.exists()
84
+ if not existed or (existed and self._log_db.stat().st_size == 0):
85
+ log.debug('Creating schema.')
86
+ self._cursor.execute(LOG_SCHEMA)
87
+
88
+ @override
89
+ def save_to_log(self, url: str) -> None:
90
+ if self._no_log:
91
+ return
92
+ self._cursor.execute('INSERT INTO log (url) VALUES (?)', (_clean_url(url),))
93
+ self._connection.commit()
94
+
95
+ @override
96
+ def is_saved(self, url: str) -> bool:
97
+ if self._no_log:
98
+ return False
99
+ self._cursor.execute('SELECT COUNT(url) FROM log WHERE url = ?', (_clean_url(url),))
100
+ count: int
101
+ count, = self._cursor.fetchone()
102
+ return count == 1
103
+
104
+ @override
105
+ def __exit__(self, _: type[BaseException] | None, __: BaseException | None,
106
+ ___: TracebackType | None) -> None:
107
+ """Clean up."""
108
+ self._cursor.close()
109
+ self._connection.close()
110
+
111
+ def process(self) -> None:
112
+ """Process posts."""
113
+ with chdir(self._output_dir):
114
+ self.get_text(f'https://www.instagram.com/{self._username}/')
115
+ self.add_csrf_token_header()
116
+ r = self.get_json('https://i.instagram.com/api/v1/users/web_profile_info/',
117
+ params={'username': self._username},
118
+ cast_to=WebProfileInfo)
119
+ with Path('web_profile_info.json').open('w', encoding='utf-8') as f:
120
+ json.dump(r, f, indent=2, sort_keys=True)
121
+ user_info = r['data']['user']
122
+ if not self.is_saved(user_info['profile_pic_url_hd']):
123
+ with Path('profile_pic.jpg').open('wb') as f:
124
+ f.writelines(
125
+ self.session.get(user_info['profile_pic_url_hd'],
126
+ stream=True).iter_content(chunk_size=512))
127
+ self.save_to_log(user_info['profile_pic_url_hd'])
128
+ try:
129
+ for item in self.highlights_tray(user_info['id'])['tray']:
130
+ self.add_video_url('https://www.instagram.com/stories/highlights/'
131
+ f'{item["id"].split(":")[-1]}/')
132
+ except HTTPError:
133
+ log.exception('Failed to get highlights data.')
134
+ self.save_edges(user_info['edge_owner_to_timeline_media']['edges'])
135
+ d = self.graphql_query(
136
+ {
137
+ 'data': {
138
+ 'count': 12,
139
+ 'include_reel_media_seen_timestamp': True,
140
+ 'include_relationship_info': True,
141
+ 'latest_besties_reel_media': True,
142
+ 'latest_reel_media': True
143
+ },
144
+ 'username': self._username,
145
+ '__relay_internal__pv__PolarisIsLoggedInrelayprovider': True,
146
+ '__relay_internal__pv__PolarisShareSheetV3relayprovider': True,
147
+ },
148
+ cast_to=XDTAPIV1FeedUserTimelineGraphQLConnectionContainer)
149
+ if not d:
150
+ log.error('First GraphQL query failed.')
151
+ else:
152
+ self.save_edges(d['xdt_api__v1__feed__user_timeline_graphql_connection']['edges'])
153
+ page_info = d['xdt_api__v1__feed__user_timeline_graphql_connection']['page_info']
154
+ while page_info['has_next_page']:
155
+ d = self.graphql_query(
156
+ {
157
+ 'after': page_info['end_cursor'],
158
+ 'before': None,
159
+ 'data': {
160
+ 'count': 12,
161
+ 'include_reel_media_seen_timestamp': True,
162
+ 'include_relationship_info': True,
163
+ 'latest_besties_reel_media': True,
164
+ 'latest_reel_media': True,
165
+ },
166
+ 'first': 12,
167
+ 'last': None,
168
+ 'username': self._username,
169
+ '__relay_internal__pv__PolarisIsLoggedInrelayprovider': True,
170
+ '__relay_internal__pv__PolarisShareSheetV3relayprovider': True,
171
+ },
172
+ cast_to=XDTAPIV1FeedUserTimelineGraphQLConnectionContainer)
173
+ if not d:
174
+ break
175
+ page_info = d['xdt_api__v1__feed__user_timeline_graphql_connection'][
176
+ 'page_info']
177
+ self.save_edges(
178
+ d['xdt_api__v1__feed__user_timeline_graphql_connection']['edges'])
179
+ if self.video_urls:
180
+ with get_configured_yt_dlp() as ydl:
181
+ while self.video_urls and (url := self.video_urls.pop()):
182
+ if self.is_saved(url):
183
+ log.info('`%s` is already saved.', url)
184
+ continue
185
+ if ydl.extract_info(url):
186
+ log.info('Extracting `%s`.', url)
187
+ self.save_to_log(url)
188
+ else:
189
+ self.failed_urls.add(url)
190
+ if self.failed_urls:
191
+ log.warning('Some video URIs failed. Check failed.txt.')
192
+ with Path('failed.txt').open('w', encoding='utf-8') as f:
193
+ for url in self.failed_urls:
194
+ f.write(f'{url}\n')
File without changes
@@ -0,0 +1,78 @@
1
+ """Saved posts scraper."""
2
+ from __future__ import annotations
3
+
4
+ from contextlib import chdir
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any
7
+ import logging
8
+
9
+ from .client import InstagramClient
10
+ from .constants import API_HEADERS, PAGE_FETCH_HEADERS
11
+ from .utils import SaveCommentsCheckDisabledMixin
12
+
13
+ if TYPE_CHECKING:
14
+
15
+ from collections.abc import Iterable
16
+
17
+ from .typing import BrowserName
18
+
19
+ __all__ = ('SavedScraper',)
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ class SavedScraper(InstagramClient, SaveCommentsCheckDisabledMixin):
24
+ """Scrape saved posts."""
25
+ def __init__(
26
+ self,
27
+ browser: BrowserName = 'chrome',
28
+ browser_profile: str = 'Default',
29
+ output_dir: str | Path | None = None,
30
+ *,
31
+ comments: bool = False,
32
+ ) -> None:
33
+ """
34
+ Initialise ``SavedScraper``.
35
+
36
+ Parameters
37
+ ----------
38
+ browser : BrowserName
39
+ The browser to use.
40
+ browser_profile : str
41
+ The browser profile to use.
42
+ output_dir : str | Path | None
43
+ The output directory to save the posts to.
44
+ comments : bool
45
+ Whether to save comments or not.
46
+ """
47
+ super().__init__(browser, browser_profile)
48
+ self._output_dir = Path(output_dir or Path.cwd() / '@@saved-posts@@')
49
+ Path(self._output_dir).mkdir(parents=True, exist_ok=True)
50
+ self.should_save_comments = comments
51
+
52
+ def unsave(self, items: Iterable[str]) -> None:
53
+ """Unsave saved posts."""
54
+ for item in items:
55
+ log.info('Unsaving %s.', item)
56
+ self.session.post(f'https://www.instagram.com/web/save/{item}/unsave/',
57
+ headers=API_HEADERS)
58
+
59
+ def process(self, *, unsave: bool = False) -> None:
60
+ """Process the saved posts."""
61
+ with chdir(self._output_dir):
62
+ self.add_csrf_token_header()
63
+ self.session.get('https://www.instagram.com/', headers=PAGE_FETCH_HEADERS)
64
+ feed = self.get_json('https://www.instagram.com/api/v1/feed/saved/posts/',
65
+ cast_to=dict[str, Any])
66
+ self.save_edges({
67
+ 'node': {
68
+ '__typename': 'XDTMediaDict',
69
+ 'id': item['media']['id'],
70
+ 'code': item['media']['code'],
71
+ 'owner': item['media']['owner'],
72
+ 'video_dash_manifest': item['media'].get('video_dash_manifest')
73
+ }
74
+ } for item in feed['items'])
75
+ if unsave:
76
+ self.unsave(item['media']['code'] for item in feed['items'])
77
+ if feed.get('more_available'):
78
+ log.warning('Unhandled pagination.')